import pandas as pd
import numpy as np
import os
import datetime
from dateutil.relativedelta import relativedelta
import re
from IPython.display import Markdown as md
from plotly import subplots
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objects as go
import plotly.express as px
from IPython.core.display import display, HTML, Javascript
import IPython.display
import string
import nltk
from nltk.stem import WordNetLemmatizer
# Load 2021 Kaggle Survey data.
# low_memory=False reads the whole file in one pass so mixed-type columns
# are inferred consistently.
data_21 = pd.read_csv('./kaggle_survey_2021_responses.csv', low_memory=False)
# Row 0 of the survey export repeats the question text; drop it so only
# respondent rows remain.
data_21 = data_21.drop(0) # Drop row with column names
# Load job listing datasets (one file per job title).
da = pd.read_csv("./DataAnalyst.csv")
ds = pd.read_csv("./DataScientist.csv")
# Load data for google trends on data-related fields
trend_data = pd.read_csv("./trends/trend_ds_da_ml.csv")
# Load data for google trends on data-related courses
trend_course_data = pd.read_csv("./trends/trend_courses.csv")
# CSS styling for markdown
# CSS injected into the notebook so the markdown cells below can use the
# custom classes (headings, highlight spans, and the two table styles).
# Fixed: `.salary-diff-table tr th` had the typo "upppercase", which made
# that text-transform rule a silent no-op.
styling = """
<style>
.main-heading{
background-color: #b2481b;
color: white !important;
font-family: Helvetica;
font-size: 32px !important;
padding: 12px 12px;
margin-bottom: 5px;
border-radius: 4px;
box-shadow: rgba(0, 0, 0, 0.19) 0px 10px 20px, rgba(0, 0, 0, 0.23) 0px 6px 6px;
}
.sub-heading{
width: auto !important;
background-color: #cd8b59;
color: white !important;
font-family: Helvetica;
font-size: 24px !important;
padding: 10px 12px;
margin-bottom: 3px;
box-shadow: rgba(0, 0, 0, 0.16) 0px 3px 6px, rgba(0, 0, 0, 0.23) 0px 3px 6px;
}
.default-font-color{
color: rgba(0,0,0,0.7) !important;
}
.highlight-orange {
background: #b2481b;
color: white;
padding: 1px 3px;
}
.highlight-cream{
background: #cd8b59;
color: white;
padding: 1px 3px;
}
.salary-diff-table tr th{
text-transform: uppercase;
}
.salary-diff-table th{
color: #444;
font-weight: bold !important;
text-transform: uppercase;
vertical-align: bottom !important;
text-align: center !important;
height: 10px !important;
padding: 0 !important;
}
.salary-diff-table th, .salary-diff-table td{
width: 120px;
height: 35px;
}
.salary-diff-table td:not(td:first-child){
padding: 15px;
font-size: 20px;
border: 2px solid white;
background: #ddd;
text-align: center;
color: #222;
}
.salary-diff-table td:first-child, .salary-diff-table th:first-child{
font-weight: bold;
width: 150px;
color: #444;
padding-right: 10px;
text-align: right !important;
text-transform: uppercase;
}
.cell-highlight-orange{
color: #efefef !important;
background: #b2481b !important;
}
.cell-highlight-black{
color: #efefef !important;
background: #222 !important;
}
.related-queries-table th{
font-size: 14px;
font-weight: bold !important;
text-align: left;
background: #B2481A !important;
border: none !important;
color: #efefef;
text-transform: uppercase;
padding: 4px;
}
.related-queries-table td:nth-child(even), .related-queries-table th:nth-child(even){
border-right: 30px solid white !important;
width: 200px;
}
.related-queries-table td{
padding: 10px 10px 10px 15px !important;
border: 2px solid white !important;
border-bottom: 1px solid #666 !important;
border-top: 1px solid #666 !important;
background: #ddd;
font-size: 15px;
}
.related-queries-table tr:nth-child(even) td{
background: #efefef !important;
}
.sidenote{
font-size: 13px;
border: 1px solid #d7d7d7;
padding: 1px 10px 2px;
box-shadow: 1px 1px 2px 1px rgba(0,0,0,0.3);
margin-bottom: 3px;
}
</style>
"""
# Render the stylesheet into the notebook output.
HTML(styling)
C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\188145378.py:15: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display
The WEF 2020 Future of Jobs Report lists Data Analysts and Data Scientists as the highest emerging job roles of the decade. The report, like many others, groups the professions of data scientists and analysts together, regarding them as one and the same. This is part of a common trend: people fail to understand how the two professions differ, often lumping both data professions together for all intents and purposes.
A similar situation arises with the terms Artificial Intelligence (AI), deep learning, and machine learning, which are used interchangeably in the news and in spurious marketing campaigns. The same confusion is encountered when trying to understand the difference between data scientists and data analysts.
# Colour palette and helper functions
# color palette shared by every chart in the notebook
palette_brown = "#925632"
palette_orange = "#b2481b"   # primary accent (data science)
palette_cream = "#cd8b59"    # secondary accent (data analysis)
palette_cream2 = "#dbb0a0"   # lighter cream, used for difference bars
palette_cream3 = "#e0c2c0"
annotation_text_color = "#555"  # muted grey for annotation body text
"""
function annotation_helper(...)
Helper for annotations in plotly. While reducing the amount of code to create an annotation, it also:
- Allows us to provide the text into an array of
strings(one for each new line) instead of one really long <br> separated text param
- Provides basic functionality for individual line spacing(s) between each line
- Custom annotation rectangle
- Basic debugging for annotation positioning
"""
def annotation_helper(fig, texts, x, y, line_spacing, align="left", bgcolor="rgba(0,0,0,0)", borderpad=0, ref="axes", xref="x", yref="y", width=100, debug = False):
    # line_spacing is either a single number (same gap after every line) or a
    # list of per-line gaps. Callers in this file pass lists of len(texts) or
    # len(texts)-1 entries.
    is_line_spacing_list = isinstance(line_spacing, list)
    # Cumulative vertical offset applied to each successive text line.
    total_spacing = 0
    for index, text in enumerate(texts):
        # NOTE(review): when index == len(line_spacing) the previous gap is
        # silently reused (handles lists one entry shorter than texts); a list
        # shorter than len(texts)-1 would raise IndexError — TODO confirm
        # callers always satisfy this.
        if is_line_spacing_list and index!= len(line_spacing):
            current_line_spacing = line_spacing[index]
        elif not is_line_spacing_list:
            current_line_spacing = line_spacing
        # One plotly annotation per text line, shifted down by the gaps so far.
        fig.add_annotation(dict(
            x= x,
            y= y - total_spacing,
            width = width,
            showarrow=False,
            text= text,
            align= align,
            borderpad=4 if debug == False else 0, # doesn't work with new background box implementation :S
            # ref="paper" switches both axes to paper coordinates; otherwise
            # the explicit xref/yref axis names are used.
            xref= "paper" if ref=="paper" else xref,
            yref= "paper" if ref=="paper" else yref,
            bordercolor= "#222",
            borderwidth= 2 if debug == True else 0 # shows the actual borders of the annotation box
        ))
        total_spacing += current_line_spacing
    # Optional background rectangle behind the whole text block.
    # NOTE(review): line_spacing[-1] assumes a list here — a scalar spacing
    # combined with a non-default bgcolor would raise TypeError; no caller in
    # this file does that, but confirm before reusing elsewhere.
    if bgcolor != "rgba(0,0,0,0)":
        fig.add_shape(type="rect",
            xref= "paper" if ref=="paper" else xref,
            yref= "paper" if ref=="paper" else yref,
            xanchor = x, xsizemode = "pixel",
            x0=-width/2, x1= +width/2, y0=y + line_spacing[-1], y1=y -total_spacing,
            fillcolor= bgcolor,
            line = dict(width=0))
    if debug == True:
        handle_annot_debug(fig, x, y, ref)
# part of debug function that draws the base x and y coordinates of the annotation box.
def handle_annot_debug(fig, x, y, ref):
    """Draw red crosshair guide lines through the annotation anchor (x, y).

    The horizontal/vertical lines extend well past the plot area so the
    anchor position is visible regardless of zoom. Used only when
    annotation_helper is called with debug=True.
    """
    # Coordinate refs: the long axis of each guide is always in paper space;
    # the anchored axis follows the caller's ref ("axes" -> data coordinates).
    horizontal_yref = "y" if ref == "axes" else ref
    vertical_xref = "x" if ref == "axes" else "paper"
    # Horizontal guide through y.
    fig.add_shape(
        type="line",
        xref="paper",
        yref=horizontal_yref,
        y0=y, y1=y, x0=-1, x1=2,
        line=dict(color="red", width=1),
    )
    # Vertical guide through x.
    fig.add_shape(
        type="line",
        yref="paper",
        xref=vertical_xref,
        x0=x, x1=x, y0=-1, y1=2,
        line=dict(color="red", width=1),
    )
Even a simple search for “data scientist vs data analyst” shows no shortage of articles which delve into this common misunderstanding. Clearly this is a problem that a lot of people seem to have...
Scouring through these 50 odd search results you’ll find multiple cases where the definitions of the two terms overlap, offering little to no clarity at times. In worse cases the two are even used as synonyms for one another. Clearly even turning to Google to give us a cut and dry answer is problematic!
One of the major reasons for this is that over the past decade terms like data analysis have attained the status of buzzwords. With more businesses and online content creators getting involved in the hype around data science and analytics, these buzzwords are thrown around with little care for correct terminology.
# Data prep for the Google-trends frame: give the columns readable names and
# parse the month strings into datetimes for plotting.
trend_data.columns = ["month", "data_science", "data_analysis", "machine_learning"]
trend_data["month"] = pd.to_datetime(trend_data["month"])

# Number of months averaged together when smoothing each trend line.
moving_average_window = 6

# Figure layout shared by the three trend traces below: yearly ticks on the
# x axis, right-hand 0-105 axis on y, unified hover, no built-in legend.
layout = dict(
    width=600,
    height=500,
    showlegend=False,
    plot_bgcolor="#fff",
    margin=dict(t=150, l=100, b=0),
    xaxis=dict(
        showline=True,
        linewidth=1,
        linecolor="#444",
        tickformat="%Y",
        dtick="M48",
        range=[datetime.date(2008, 12, 1), datetime.date(2021, 10, 31)],
    ),
    yaxis=dict(
        showline=False,
        showgrid=True,
        gridwidth=1,
        gridcolor="#ddd",
        linecolor="#444",
        range=[0, 105],
        side="right",
    ),
    hovermode="x unified",
    hoverlabel=dict(bgcolor="white", font_size=12),
)
fig = go.Figure(layout=layout)

# One smoothed line per search term. Trace order matters (later traces draw
# on top), so data analysis goes first and machine learning last, exactly as
# in the chart's visual stacking.
for column, colour, line_width, trace_name in (
    ("data_analysis", palette_cream, 2.3, "data analysis"),
    ("data_science", palette_orange, 2.3, "data science"),
    ("machine_learning", "#444", 2, "machine learning"),
):
    subset = trend_data[["month", column]]
    fig.add_trace(go.Scatter(
        x=subset["month"],
        y=subset[column].rolling(moving_average_window).mean(),
        mode='lines',
        line=dict(color=colour, width=line_width),
        name=trace_name))
# Annotations and title
# Call-out comparing 2013 vs 2021 data-science search volume.
text = [
    "<span style='color:%s; font-family:Tahoma; font-size:12px'> Searches for <b style='color:%s'>data science</b> in</span>" % (annotation_text_color, palette_orange),
    "<span style='color:%s; font-family:Tahoma; font-size:12px'> 2013 were almost 30 times less</span>" % (annotation_text_color),
    "<span style='color:%s; font-family:Tahoma; font-size:12px'>◀ than what they are in 2021</span>" % (annotation_text_color),
]
annotation_helper(fig, text, datetime.date(2016, 7, 12), 11.4, line_spacing = [4,4,4], width= 200 )
# White rectangle masking the trace lines underneath the call-out text.
fig.add_shape(type="rect",
    x0= datetime.date(2013, 12, 20), x1= datetime.date(2017, 12, 12) + relativedelta(months=12),
    y0= 0 , y1= 13,
    fillcolor="white",
    line_color = "white"
)
# Call-out on the steady growth of data-analysis searches.
text = [
    "<span style='color:%s; font-family:Tahoma; font-size:12px'><b style='color:%s'>Data analysis</b> searches</span>" % (annotation_text_color, palette_cream),
    "<span style='color:%s; font-family:Tahoma; font-size:12px'>showed a steady growth</span>" % (annotation_text_color),
    "<span style='color:%s; font-family:Tahoma; font-size:12px'>over the years</span>" % (annotation_text_color),
    "<span style='color:%s; font-family:Tahoma; font-size:12px'>▼</span>" % (annotation_text_color),
]
annotation_helper(fig, text, datetime.date(2012, 6, 12), 44.5, line_spacing = [4,4,5], width= 200 )
# Axis explainer in the top-right corner (paper coordinates).
text = [
    "<span style='color:%s; font-family:Tahoma; font-size:12px'>Searches relative</span>" % (annotation_text_color),
    "<span style='color:%s; font-family:Tahoma; font-size:12px'>to highest point</span>" % (annotation_text_color),
    "<span style='color:%s; font-family:Tahoma; font-size:12px'>on graph</span>" % (annotation_text_color)
]
annotation_helper(fig, text, 1.07, 1.12, ref="paper", line_spacing = [0.037,0.037], align="right", width= 200)
# Coloured line labels drawn to the left of the plot in place of a legend.
text = [
    "<span style='color:%s; font-family:Tahoma; font-size:14px'>Data</span>" % (palette_cream),
    "<span style='color:%s; font-family:Tahoma; font-size:14px'>Analysis</span>" % (palette_cream),
    "<span style='color:%s; font-family:Tahoma; font-size:14px'>Machine</span>" % (annotation_text_color),
    "<span style='color:%s; font-family:Tahoma; font-size:14px'>Learning</span>" % (annotation_text_color),
    "<span style='color:%s; font-family:Tahoma; font-size:14px'>Data</span>" % (palette_orange),
    "<span style='color:%s; font-family:Tahoma; font-size:14px'>Science</span>" % (palette_orange)
]
annotation_helper(fig, text, -0.495, 0.26, ref="paper", line_spacing = [0.045,0.1,0.045,0.08,0.045], align="right", width= 200 )
# Title and subtitle, positioned above the plot in paper coordinates.
text = [
    "<span style='font-size:24px; font-family:Times New Roman;'>The rise of Data Science and Machine Learning</span>",
    "<span style='font-size:13px; font-family:Helvetica'> Google searches for data science and machine learning </span>",
    "<span style='font-size:13px; font-family:Helvetica'> saw a massive <b>surge in 2013 onwards</b>. </span>",
]
annotation_helper(fig, text, 1.03, 1.355, [0.10,0.05,0.05],ref="paper", width=500)
fig.show()
And indeed, searches for data science related terms have taken off over the past decade. Some of the popular related search terms include.
| | Data Scientist related queries | | Data Analyst related queries |
|---|---|---|---|
| 1 | data scientist salary | 1 | big data analytics |
| 2 | data science jobs | 2 | data analytics certificate |
| 3 | machine learning | 3 | coursera |
| 4 | data science online | 4 | data analytics meaning |
| 5 | master data science | 5 | ms data analytics |
| 6 | data science courses | 6 | how to become data analyst |
| 7 | data analyst | 7 | data analytics career |
| 8 | big data | 8 | data analysis excel 2013 |
| 9 | data science salary | 9 | mba in data analytics |
| 10 | r data science | 10 | udacity |
With degrees and online courses on both topics being incredibly popular at the moment, not having a clear understanding of what each profession entails causes further issues down the line. Students, especially those planning on pursuing them for graduate studies, should understand what skills each field will teach them and the job opportunities that will be opened up to them as a result of their chosen path.
When looking at Google trend data, we see how interest in these fields of learning has increased dramatically over the years.
# Chart of Google trends for data-related courses.
# Rename the columns and parse the month field for plotting.
trend_course_data.columns = ["month", "data_science", "data_analysis", "machine_learning"]
trend_course_data["month"] = pd.to_datetime(trend_course_data["month"])

# Width (in months) of the rolling window used to smooth the trend lines.
moving_average_window = 6

# Layout mirrors the previous trend chart but with a horizontal legend above
# the plot and the y axis on the left.
layout = dict(
    width=600,
    height=500,
    showlegend=True,
    plot_bgcolor="#fff",
    margin=dict(t=150, b=0),
    xaxis=dict(
        showline=True,
        linewidth=1,
        linecolor="#444",
        tickformat="%Y",
        dtick="M48",
        range=[datetime.date(2010, 10, 1), datetime.date(2021, 10, 31)],
    ),
    yaxis=dict(
        showline=False,
        showgrid=True,
        gridwidth=1,
        gridcolor="#ddd",
        linecolor="#444",
        range=[0, 105],
        side="left",
    ),
    legend=dict(
        orientation="h",
        traceorder="reversed",
        yanchor="top",
        y=1.12,
        xanchor="left",
        x=-0.06,
        font=dict(family="Helvetica", size=14, color="rgba(0,0,0,100)"),
        bgcolor='rgba(255,255,255,100)',
    ),
    hovermode="x unified",
    hoverlabel=dict(bgcolor="white", font_size=12),
)
fig = go.Figure(layout=layout)

# One smoothed line per course topic. The Scatter traces are kept out of the
# legend; invisible Bar traces added afterwards supply the legend swatches
# instead (bar swatches read more cleanly than line samples).
for column, colour, line_width, trace_name in (
    ("data_analysis", palette_cream, 2.3, "data analysis"),
    ("data_science", palette_orange, 2.3, "data science"),
    ("machine_learning", "#444", 2, "machine learning"),
):
    subset = trend_course_data[["month", column]]
    fig.add_trace(go.Scatter(
        x=subset["month"],
        y=subset[column].rolling(moving_average_window).mean(),
        mode='lines',
        line=dict(color=colour, width=line_width),
        name=trace_name,
        showlegend=False))

# Legend-only traces (never visibly drawn; added in the order that produces
# the desired legend sequence under traceorder="reversed").
for colour, legend_name in (
    (palette_cream, "Data Analysis courses"),
    ("#444", "Machine Learning"),
    (palette_orange, "Data Science"),
):
    fig.add_trace(go.Bar(x=[0], y=[1], marker=dict(color=colour), name=legend_name))
# Annotations
# Axis explainer placed inside the plot area (data coordinates).
text = [
    "<span style='color:%s; font-family:Tahoma; font-size:12px'>Searches relative</span>" % (annotation_text_color),
    "<span style='color:%s; font-family:Tahoma; font-size:12px'>to highest point</span>" % (annotation_text_color),
    "<span style='color:%s; font-family:Tahoma; font-size:12px'>on graph</span>" % (annotation_text_color)
]
annotation_helper(fig, text, datetime.date(2013, 6, 12), 102.5, line_spacing = [4,4], align="left", width= 200)
# Title and subtitle for plot
text = [
    "<span style='font-size:24px; font-family:Times New Roman;'>Rise in popularity of online learning courses</span>",
    "<span style='font-size:13px; font-family:Helvetica'> Google searches for data science courses outpaced those for </span>",
    "<span style='font-size:13px; font-family:Helvetica'> machine learning and data analysis courses around 2013 </span>",
]
annotation_helper(fig, text, 1.03, 1.375, [0.09,0.050,0.050],ref="paper", width=500)
fig.show()
So what do data analysts do? Well, in simple terms they examine large datasets, generate insights and present their findings to help organizations make better decisions. It is their job to discover, interpret and communicate meaningful patterns and trends in the data to a non-technical audience.
However the job isn’t as simple as playing around with perfect data to answer business questions. To get to this stage analysts work closely with teams to manage collection and storage of data, cleaning the obtained data, at times defining processes to automate these tasks.
# Q24 is a multi-select question; each selected activity lives in its own
# sub-column, so collect every column belonging to Q24.
questions_list = [question for question in data_21.columns if "Q24" in question]
# Human-readable labels for the Q24 activity columns (same order as the columns).
column_desc = ["Analyze data for business decisions", "Build data infrastructure", "Build ML Prototypes", "Build ML service for workflows", "Experiment/improve existing models", "Research to advance ML", "None", "Other"]
# Matrix of profession (Q5) x fraction of respondents performing each role.
# A Q24 cell is non-null only when the respondent ticked that activity, so the
# mean of notnull() is the share of that profession performing the role.
# (The original wrapped the lambda in a set literal, which produced a
# MultiIndex that was immediately discarded; a plain callable is equivalent.)
roles_21 = data_21.groupby("Q5")[questions_list].agg(lambda x: x.notnull().mean())
roles_21.columns = column_desc
# Reversed so the first role ends up at the top of the horizontal bar charts.
mean_roles = roles_21.mean().iloc[::-1]
fig = go.Figure()
# Role-share percentages for data analysts only (single-row frame).
roles_21_da = roles_21[roles_21.index == "Data Analyst"]
labels = roles_21_da.columns
layout = dict(
    showlegend = True,
    legend = dict(
        orientation="h",
        traceorder="reversed",
        yanchor="top",
        y=1.12,
        font=dict(family="Helvetica", size=14, color="rgba(0,0,0,100)"),
        bgcolor = "rgba(255,255,255,100)",
        xanchor="left",
        x= -0.06,
    ),
    margin = dict(t=100, l=200, b=0, pad=6),
    plot_bgcolor= '#fff',
    xaxis = dict(dtick = 0.2, tickformat=".0%"),
    # Built-in tick labels are hidden (white font); styled replacements are
    # drawn below with annotation_helper.
    yaxis = dict(categoryorder='array', categoryarray=labels[::-1], tickfont=dict(color="#fff") ),
    barmode= "overlay",
    height = 450,
    width = 600
)
fig.update_layout(layout)
trace = go.Bar(
    x = roles_21_da.values[0],
    y = labels,
    width = 0.9,
    # Highlight the first two roles (orange/cream); grey out the other six.
    marker = dict( color= [palette_orange] + [palette_cream] + ["#aaa"]*6 ),
    # NOTE(review): inside labels use %{x:.2p} but outside labels use .1p —
    # looks unintentional (the sibling Data Scientist chart uses .2p for both).
    texttemplate = [" <span style='color: #fff'>%{x:.2p}</span> "] * 6 + [" <span style='color: #222'>%{x:.1p}</span> "] * 2,
    textposition = ["inside"] * 6 + ["outside"] * 2,
    insidetextanchor="start",
    orientation = "h",
    hoverinfo = "none",
    showlegend=False,
)
fig.add_trace(trace)
# Add rectangles to show survey averages
# NOTE(review): mean_roles[index] indexes a labelled Series by position —
# works here but is deprecated in recent pandas; confirm before upgrading.
for index, row in enumerate(mean_roles.index):
    fig.add_shape(
        type="rect",
        y0= index+0.42, y1= index-0.42,
        x0= mean_roles[index], x1= mean_roles[index]+0.013,
        fillcolor="white",
        line = dict(color = "#444", width= 1)
    )
# Legend for the survey-average tick marks.
text = [
    "<b style='color:%s; font-family:Tahoma; font-size:12px'>Survey-wide average</b>" % ("#222"),
    "<span style='color:%s; font-family:Tahoma; font-size:12px'>▼</span>" % ("#222")
]
annotation_helper(fig, text, 0.61, 1.10, ref="paper", line_spacing = [0.04,0.04], align="center", width= 200)
# Title and subtitle.
text = [
    "<span style='font-size:24px; font-family:Times New Roman;'>Responsibility of Data Analysts</span>",
    "<span style='font-size:14px; font-family:Helvetica'> Primary focus on discovering trends in data for </span>",
    "<b style='font-size:14px; font-family:Helvetica; color: %s'> data-driven business planning </b>" % (palette_orange),
]
annotation_helper(fig, text, 1.03, 1.30, [0.095,0.055,0.055],ref="paper", width=500)
# Custom axis labels created using annotations
edited_labels = ["<span style='color:%s; font-family:Helvetica; font-size:14px'> %s </span>" % (annotation_text_color, label) for label in labels]
edited_labels[0] = "<b style='color:%s; font-family:Helvetica; font-size:14px'> Analyze data for <br>business decisions </b>" % (palette_orange)
edited_labels[1] = "<b style='color:%s; font-family:Helvetica; font-size:14px'> Build data infrastructure </b>" % (palette_cream)
annotation_helper(fig, edited_labels, -1.08, 1, [0.15, 0.125, 0.17, 0.125, 0.17, 0.125, 0.125], ref="paper", align="right", width=300)
fig.show()
fig = go.Figure()
# Role-share percentages for data scientists only (single-row frame).
roles_21_ds = roles_21[roles_21.index == "Data Scientist"]
labels = roles_21.columns
layout = dict(
    showlegend = True,
    legend = dict(
        orientation="h",
        traceorder="reversed",
        yanchor="top",
        y=1.12,
        font=dict(family="Helvetica", size=14, color="rgba(0,0,0,100)"),
        bgcolor = 'rgba(255,255,255,100)',
        xanchor="left",
        x= -0.06,
    ),
    margin = dict(t=150, b=0, l=200, pad=6),
    plot_bgcolor= '#fff',
    xaxis = dict(dtick = 0.2, tickformat=".0%", range=[0,0.75]),
    # Built-in tick labels hidden (white); custom labels drawn below.
    yaxis = dict(categoryorder='array', categoryarray = labels[::-1], tickfont=dict(color="#fff") ),
    barmode= "overlay",
    height = 450,
    width = 600
)
fig.update_layout(layout)
trace = go.Bar(
    x = roles_21_ds.values[0],
    y = labels,
    width = 0.9,
    # Highlight the three ML-centric roles (positions 2-4); grey the rest.
    marker = dict( color=["#999"] * 2 + [palette_orange] * 3 + ["#999"] * 3),
    texttemplate = [" <span style='color: #fff'>%{x:.2p}</span> "]*6 + [" <span style='color: #222'>%{x:.2p}</span> "]*2,
    textposition = ["inside"] * 6 + ["outside"] * 2,
    insidetextanchor="start",
    orientation = "h",
    hoverinfo = "none",
    showlegend=False,
)
fig.add_trace(trace)
# Add rectangles to show survey averages
# NOTE(review): mean_roles[index] is positional indexing on a labelled Series
# — deprecated in recent pandas; confirm before upgrading.
for index, row in enumerate(mean_roles.index):
    fig.add_shape(
        type="rect",
        y0= index+0.42, y1= index-0.42,
        x0= mean_roles[index], x1= mean_roles[index]+0.013,
        fillcolor="white",
        line = dict(color = "#222", width=0.75)
    )
# ANNOTATIONS
# Legend for the survey-average tick marks.
text = [
    "<b style='color:%s; font-family:Tahoma; font-size:12px'>Survey-wide average</b>" % ("#222"),
    "<span style='color:%s; font-family:Tahoma; font-size:12px'>▼</span>" % (annotation_text_color)
]
annotation_helper(fig, text, 0.610, 1.125, ref="paper", line_spacing = [0.05,0.04], align="center", width= 200)
# Custom axis labels created using annotations
edited_labels = ["<span style='color:%s; font-family:Helvetica; font-size:14px'> %s </span>" % (annotation_text_color, label) for label in labels]
edited_labels[0] = "<span style='color:%s; font-family:Helvetica; font-size:14px'> Analyze data for <br>business decisions </span>" % (annotation_text_color)
edited_labels[2] = "<b style='color:%s; font-family:Helvetica; font-size:14px'> Build ML prototypes </b>" % (palette_orange)
edited_labels[3] = "<b style='color:%s; font-family:Helvetica; font-size:14px'> Build ML services for workflows </b>" % (palette_orange)
edited_labels[4] = "<b style='color:%s; font-family:Helvetica; font-size:14px'> Experiment and improve <br>existing models </b>" % (palette_orange)
annotation_helper(fig, edited_labels, -1.11, 1, [0.15, 0.125, 0.17, 0.115, 0.18, 0.125, 0.125], ref="paper", align="right", width=300)
# Title and subtitle.
text = [
    "<span style='font-size:24px; font-family:Times New Roman;'>Responsibility of Data Scientists</span>",
    "<span style='font-size:14px; font-family:Helvetica'> Includes many roles from data analysis with considerably </span>",
    "<span style='font-size:14px; font-family:Helvetica'> more focus placed on <b style='color:%s'>machine learning related tasks</b>. </span>" % (palette_orange),
]
annotation_helper(fig, text, 1.03, 1.38, [0.11,0.06],ref="paper", width=500)
fig.show()
I understand that reading the above might leave many feeling like they still can’t see a clear distinction between the two professions - and the truth is that the lines are a bit blurred when it comes to data science and data analysis. Both the roles perform varying degrees of data collection, cleaning, and analysis to gain actionable insights for data-driven decision making - leaving room for a lot of overlap.
fig = go.Figure()
roles_21_ds = roles_21[roles_21.index == "Data Scientist"]
labels = roles_21.columns
# getting lower and differences between the percentages for each role, there's probably a much easier way to do this
# For each role: the smaller of the two professions' shares (the invisible
# base of the stacked bar) and the absolute gap (the visible segment).
lower_values = [roles_21_da[col].values[0] if roles_21_da[col].values[0] < roles_21_ds[col].values[0] else roles_21_ds[col].values[0] for col in roles_21_da.columns]
difference_values = [np.abs(roles_21_da[col].values[0] - roles_21_ds[col].values[0]) for col in roles_21_da.columns]
# NOTE(review): ds_higher is computed but never used below.
ds_higher = roles_21_da.values < roles_21_ds.values
layout = dict(
    showlegend = True,
    legend = dict(
        orientation="h",
        traceorder="reversed",
        yanchor="top",
        y=1.2,
        xanchor="left",
        x= -0.72,
        font=dict(family="Helvetica", size=14, color="rgba(0,0,0,100)"),
        bgcolor = 'rgba(255,255,255,100)',
    ),
    margin = dict(t=150, b=0, l=200),
    plot_bgcolor= '#fff',
    xaxis = dict(dtick = 0.2, tickformat=".0%", range=[0,0.85], zeroline=True, zerolinewidth=2, zerolinecolor= annotation_text_color),
    # Built-in tick labels hidden (white); custom labels drawn below.
    yaxis = dict(categoryorder='array', categoryarray = labels[::-1], tickfont=dict(color="#fff") ),
    barmode= "stack",
    height = 500,
    width = 600
)
fig.update_layout(layout)
# Draw the base bar in white
trace = go.Bar(
    x = lower_values,
    y = labels,
    width = 0.9,
    marker = dict( color=["#fff"] * 8),
    orientation = "h",
    hoverinfo = "none",
    showlegend=False,
)
fig.add_trace(trace)
# Draw the actual bars with the difference in roles percentages
trace = go.Bar(
    x = difference_values,
    y = labels,
    width = 0.9,
    # Cream for the roles where the gap is emphasised; grey elsewhere.
    marker = dict( color= ["#ddd"]*1 + [palette_cream2]*5 + ["#ddd"]*2),
    orientation = "h",
    hoverinfo = "none",
    showlegend=False,
)
fig.add_trace(trace)
# Hidden traces for legend
fig.add_trace(go.Bar(x=[0], y=["None"], marker=dict(color = "#999" ), name="Data Analyst", orientation="h"))
fig.add_trace(go.Bar(x=[0], y=["None"], marker=dict(color = palette_orange), name="Data scientist", orientation="h"))
# Draw rectangles to denote the data points
# Per role: survey-wide average tick (white), data-analyst tick (grey) and
# data-scientist tick (orange).
# NOTE(review): mean_roles[index] is positional indexing on a labelled Series
# — deprecated in recent pandas; confirm before upgrading.
for index, row in enumerate(mean_roles.index):
    fig.add_shape(
        type="rect",
        y0= index+0.44, y1= index-0.44,
        x0= mean_roles[index]-0.007, x1= mean_roles[index]+0.007,
        fillcolor="white",
        line = dict(color = "#222", width=0.75)
    )
    fig.add_shape(
        type="rect",
        y0= index+0.42, y1= index-0.42,
        x0= roles_21_da[row].values[0], x1= roles_21_da[row].values[0] +0.013,
        fillcolor= "#999",
        line = dict(color = annotation_text_color, width=1.5)
    )
    fig.add_shape(
        type="rect",
        y0= index+0.42, y1= index-0.42,
        x0= roles_21_ds[row].values[0], x1= roles_21_ds[row].values[0] +0.013,
        fillcolor= palette_orange,
        line = dict(color = "#222", width=1.5)
    )
# ANNOTATIONS
# Legend for the survey-average tick marks.
text = [
    "<b style='color:%s; font-family:Tahoma; font-size:12px'>Survey-wide average</b>" % ("#222"),
    "<span style='color:%s; font-family:Tahoma; font-size:12px'>▼</span>" % (annotation_text_color)
]
annotation_helper(fig, text, 0.531, 1.125, ref="paper", line_spacing = [0.05,0.04], align="center", width= 200)
# Custom axis labels created using annotations.
edited_labels = ["<span style='color:%s; font-family:Helvetica; font-size:14px'> %s </span>" % (annotation_text_color, label) for label in labels]
edited_labels[0] = "<span style='color:%s; font-family:Helvetica; font-size:14px'> Analyze data for <br>business decisions </span>" % (annotation_text_color)
edited_labels[1] = "<b style='color:%s; font-family:Helvetica; font-size:14px'> Build data infrastructure </b>" % (palette_orange)
edited_labels[2] = "<b style='color:%s; font-family:Helvetica; font-size:14px'> Build ML prototypes </b>" % (palette_orange)
edited_labels[3] = "<b style='color:%s; font-family:Helvetica; font-size:14px'> Build ML services for workflows </b>" % (palette_orange)
edited_labels[4] = "<b style='color:%s; font-family:Helvetica; font-size:14px'> Experiment and improve <br>existing models </b>" % (palette_orange)
edited_labels[5] = "<b style='color:%s; font-family:Helvetica; font-size:14px'> Research to advance ML </b>" % (palette_orange)
annotation_helper(fig, edited_labels, -1.06, 1, [0.15, 0.125, 0.17, 0.115, 0.18, 0.125, 0.125], ref="paper", align="right", width=300)
# Title and subtitle.
text = [
    "<span style='font-size:24px; font-family:Times New Roman;'>Difference in Responsibilities</span>",
    "<span style='font-size:13px; font-family:Helvetica'> Analysts' work is more oriented towards <b>generating business insights</b> </span>",
    "<span style='font-size:13px; font-family:Helvetica'> while data scientists also perform <b style='color:%s'>other roles</b> to a greater degree </span>" % (palette_orange),
]
annotation_helper(fig, text, 1.03, 1.45, [0.11,0.055,0.055],ref="paper", width=500)
fig.show()
The main takeaway we can gain from the survey respondents is that data scientists have a greater focus on machine learning related tasks, whereas analysts have their main role as deriving insights from data and dabble in the machine learning aspects of the job.
# Converting categorical salary data into the mean of its endpoints.
# Each Q25 answer is a salary bucket; map it to the bucket's midpoint so the
# column can be aggregated numerically. The open-ended top buckets use a
# representative value rather than a true midpoint.
# Fixed two wrong entries: "50,000-59,999" mapped to 50000 (midpoint is
# 55000) and "80,000-89,999" mapped to 85 (missing three zeros); also removed
# a duplicate "25,000-29,999" key.
q25_map = {
    "$0-999": 500,
    "1,000-1,999": 1500,
    "2,000-2,999": 2500,
    "3,000-3,999": 3500,
    "4,000-4,999": 4500,
    "5,000-7,499": 6250,
    "7,500-9,999": 8750,
    "10,000-14,999": 12500,
    "15,000-19,999": 17500,
    "20,000-24,999": 22500,
    "25,000-29,999": 27500,
    "30,000-39,999": 35000,
    "40,000-49,999": 45000,
    "50,000-59,999": 55000,
    "60,000-69,999": 65000,
    "70,000-79,999": 75000,
    "80,000-89,999": 85000,
    "90,000-99,999": 95000,
    "100,000-124,999": 112500,
    "125,000-149,999": 137500,
    "150,000-199,999": 175000,
    "200,000-249,999": 225000,
    "250,000-299,999": 275000,
    "300,000-499,999": 400000,
    "$500,000-999,999": 750000,
    ">$1,000,000": 1250000,
    # Respondents who skipped the question stay NaN after mapping.
    np.nan: np.nan,
}
# Attach the numeric salary column. Series.map translates via the dict and
# leaves unanswered (NaN) rows as NaN, replacing the fragile
# apply(lambda x: q25_map[x]) lookup that depended on a NaN dict key.
data_21["Q25_num"] = data_21["Q25"].map(q25_map)

# minimum number of data analysts to include the country
data_prof_requirement = 20

# Countries with more than the required number of BOTH data analysts and data
# scientists (the == True comparison also treats index-misaligned NaNs as False).
da_above_requirement = (data_21[data_21["Q5"] == "Data Analyst"].groupby("Q3")["Q1"].count() > data_prof_requirement) & (data_21[data_21["Q5"] == "Data Scientist"].groupby("Q3")["Q1"].count() > data_prof_requirement)
da_above_requirement_countries = da_above_requirement[da_above_requirement == True].index
selected_countries = data_21[data_21["Q3"].isin(da_above_requirement_countries)]
selected_countries = selected_countries[ selected_countries["Q5"].isin(["Data Analyst", "Data Scientist"])] # use only data for data scientists and analysts
selected_countries = selected_countries[ selected_countries["Q25_num"] > 0] # rows without a salary answer are NaN and fail this comparison
selected_countries = selected_countries.groupby(["Q3","Q5"])["Q25_num"].mean().reset_index() # aggregate by country and profession, use average salary as value
selected_countries = selected_countries.pivot(index="Q3", columns = "Q5", values="Q25_num").reset_index() # one row per country, one column per profession

# Append the survey-wide (global) averages as an extra row. Index mean_pays by
# profession label rather than position so the groupby order cannot matter.
mean_pays = data_21[(data_21["Q25_num"]>0) & (data_21["Q5"].isin(["Data Analyst","Data Scientist"]) ) ].groupby("Q5")["Q25_num"].mean()
mean_pays_row = {"Q3": "<b>Global</b>", "Data Analyst": mean_pays["Data Analyst"], "Data Scientist": mean_pays["Data Scientist"]}
# DataFrame.append was deprecated and removed in pandas 2.x; concat is the
# supported replacement.
selected_countries = pd.concat([selected_countries, pd.DataFrame([mean_pays_row])], ignore_index=True)

# Absolute, relative and ratio pay gaps between the two professions.
selected_countries["diff"] = (selected_countries["Data Scientist"] - selected_countries["Data Analyst"])
selected_countries["pct_diff"] = selected_countries["diff"] / selected_countries["Data Analyst"]
selected_countries["ratio"] = (selected_countries["Data Scientist"]) / selected_countries["Data Analyst"]

# Overall average reported salary across all respondents.
mean_pays = data_21[data_21["Q25_num"]>0]["Q25_num"].mean()

# Data for below table
# selected_countries[selected_countries["Q3"].isin(["China", "Germany", "India", "United States of America", "France", "United Kingdom of Great Britain and Northern Ireland"])]
C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\1261704088.py:54: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
# use ipython markdown to allow us to use data from python in formatted markdown cells
markdown = """
<h3>Country-wise Salary Gap</h3>
<table class="salary-diff-table">
<tr> <th> Country </th> <th> Data Scientist </th> <th> Data Analyst </th> <th> Difference </th></tr>
"""
# Countries shown in the comparison table.
table_data = selected_countries[selected_countries["Q3"].isin(["China", "Germany", "India", "United States of America", "France", "United Kingdom of Great Britain and Northern Ireland"])]
for row in table_data.iterrows():
    markdown += "<tr> <td> %s </td> <td class='cell-highlight-orange'> %d </td> <td> %d </td> <td> %d </td></tr>" % (row[1]["Q3"], row[1]["Data Scientist"], row[1]["Data Analyst"], row[1]["diff"])
# HACK: Germany is the one country where analysts out-earn scientists, so its
# row is re-styled by string-matching the rendered cell text. The hard-coded
# numbers make this brittle: if the underlying data (or the salary mapping)
# changes, the replace silently becomes a no-op and the highlight is lost.
markdown = markdown.replace("Germany </td> <td class='cell-highlight-orange'> 76574 </td> <td> 88583 </td> <td> -12008 </td>", "Germany </td> <td> 76574 </td> <td class='cell-highlight-orange'> 88583 </td> <td class='cell-highlight-black'> -12008 </td>")
markdown += """
</table>
"""
# adding some basic text in markdown before and after the table
markdown_before_table = """
<h3 class="sub-heading">Salaries</h3>
Many of us might have heard how data scientists are one of the most sought after professionals, being paid very well for their services. With the general understanding of what data scientists and analysts do being so similar, we might expect that the two are paid on the same level as one another. However this doesn’t seem to be the case - <b>data analysts earn far less</b> than their data science wielding counterparts.
<br><br>
<div class="sidenote"> Source: Kaggle ML & DS Survey 2021 - Q25. What is your current yearly compensation? </div>
"""
markdown_after_table = """
<br>
<p class="default-font-color">In line with the previous discussion on salary, we see how the average salary estimates for these jobs
are also higher in the case of data scientists. Surely this can’t be a coincidence that we see higher
numbers when checking with two different data sources. We will return to this dataset a lot to see how
our survey responses stack up against actual jobs that are currently available. </p>
<br>
"""
# Render: intro text + generated table + follow-up paragraph.
md(markdown_before_table + markdown + markdown_after_table)
| Country | Data Scientist | Data Analyst | Difference |
|---|---|---|---|
| China | 19396 | 10847 | 8548 |
| France | 45581 | 27152 | 18429 |
| Germany | 76574 | 88583 | -12008 |
| India | 27660 | 11761 | 15898 |
| United Kingdom of Great Britain and Northern Ireland | 73483 | 43250 | 30233 |
| United States of America | 152812 | 74153 | 78658 |
In line with the previous discussion on salary, we see how the average salary estimates for these jobs are also higher in the case of data scientists. Surely this can’t be a coincidence that we see higher numbers when checking with two different data sources. We will return to this dataset a lot to see how our survey responses stack up against actual jobs that are currently available.
# In this section we use iPython's md function to display data from this cell as markdown
def _salary_midpoints(listings):
    """Return a frame of salary-range parts plus a 'salary' midpoint column (in $K).

    The regex splits a range like '$41K-$78K (Glassdoor est.)' into three
    groups -- 0: lower bound, 1: separator ('K-$...'), 2: upper bound -- and
    the midpoint is the average of the two bounds.
    """
    parts = listings["Salary Estimate"].str.extract(r'(\d+)([K$-]*)(\d+)')
    parts["salary"] = (parts[2].astype(float) + parts[0].astype(float)) / 2
    return parts

# The same parsing previously appeared twice verbatim; factored into a helper.
salary_da = _salary_midpoints(da)
mean_salary_da = salary_da["salary"].mean()
salary_ds = _salary_midpoints(ds)
mean_salary_ds = salary_ds["salary"].mean()
# Assemble three markdown fragments (intro text, salary-gap table, closing
# text) and hand the combined string to IPython for rich display.
markdown_before_table = """
This is all well and good when it comes to survey responses, but much like myself, many might be wondering
if this is actually how the payment scenario works in the real world. For this we look at a dataset of
glassdoor listings to get a sense of the data analyst and data scientist openings that are actually available.
<br>
<div class="sidenote"> Source: Kaggle - Data Analyst jobs, Data Scientist jobs datasets</div>
"""
markdown_after_table = """
<br>
<p class="default-font-color">In line with the previous discussion on salary, we see how the average salary estimates for these jobs
are also higher in the case of data scientists. Surely this can’t be a coincidence that we see higher
numbers when checking with two different data sources. We will return to this dataset a lot to see how
our survey responses stack up against actual jobs that are currently available. </p>
<br>
<p class="default-font-color"> As we expand our scope to include other nations we quickly see the disparity in pays across the two professions.</p>
<div class="sidenote"> Source: Kaggle ML & DS Survey 2021 - Q25. What is your current yearly compensation?
</div>
"""
# Parsed salaries are in $K, so scale by 1000 for display; %d truncates to whole dollars.
markdown = """
<h3>Salary Gap using Glassdoor job data</h3>
<table class="salary-diff-table">
<tr> <th> </th> <th> Data Scientist </th> <th> Data Analyst </th> <th> Difference </th></tr>
<tr> <td> Mean Pay </td> <td class="cell-highlight-orange"> %d </td> <td> %d </td> <td> %d </td></tr>
</table>
""" % (mean_salary_ds*1000, mean_salary_da*1000, (mean_salary_ds - mean_salary_da)*1000)
md(markdown_before_table + markdown + markdown_after_table)
This is all well and good when it comes to survey responses, but much like myself, many might be wondering
if this is actually how the payment scenario works in the real world. For this we look at a dataset of
glassdoor listings to get a sense of the data analyst and data scientist openings that are actually available.
| Data Scientist | Data Analyst | Difference | |
|---|---|---|---|
| Mean Pay | 107867 | 72123 | 35744 |
In line with the previous discussion on salary, we see how the average salary estimates for these jobs are also higher in the case of data scientists. Surely this can’t be a coincidence that we see higher numbers when checking with two different data sources. We will return to this dataset a lot to see how our survey responses stack up against actual jobs that are currently available.
As we expand our scope to include other nations we quickly see the disparity in pays across the two professions.
# Diverging horizontal bar chart of the country-wise salary gap
fig = go.Figure()
# Helper functions to get lower/higher values later
def get_lower_values( df ):
    """Return the smaller of the two profession salaries in a row.

    When either value is NaN both comparisons are False, so the
    function falls through and returns NaN.
    """
    ds_pay = df["Data Scientist"]
    da_pay = df["Data Analyst"]
    if ds_pay < da_pay:
        return ds_pay
    if ds_pay >= da_pay:
        return da_pay
    return np.nan
def get_higher_values( df ):
    """Return the larger of the two profession salaries in a row.

    When either value is NaN both comparisons are False, so the
    function falls through and returns NaN.
    """
    ds_pay = df["Data Scientist"]
    da_pay = df["Data Analyst"]
    if ds_pay < da_pay:
        return da_pay
    if ds_pay >= da_pay:
        return ds_pay
    return np.nan
# Chart construction: an invisible base bar reaches up to the lower of the two
# salaries, then a coloured bar spans the gap between the professions.
selected_countries["lower_values"] = selected_countries.apply( lambda x: get_lower_values(x), axis=1)
selected_countries["higher_values"] = selected_countries.apply( lambda x: get_higher_values(x), axis=1)
selected_countries = selected_countries.sort_values(by=["higher_values"])
labels = selected_countries["Q3"].values
labels[-1] = "USA" # Using abbreviations to shorten the label
labels[-5] = "UK and Ireland"
# Per-country flag: True where data scientists out-earn data analysts
ds_higher = selected_countries["diff"] > 0
layout = dict(
    showlegend = True,
    legend = dict(
        traceorder="reversed",
        yanchor="top",
        y=1.14,
        xanchor="left",
        x= -0.04,
        font=dict(family="Helvetica", size=13, color= annotation_text_color),
        bgcolor = 'rgba(255,255,255,100)',
    ),
    margin = dict(t=150, l=0, b=0, pad=6),
    plot_bgcolor= '#fff',
    xaxis = dict( side="top", tickformat=".0f", showgrid=True, gridcolor="#999", zeroline=True, zerolinewidth=1, zerolinecolor= "#999"),
    barmode= "stack",
    height = 700,
    width = 550
)
fig.update_layout(layout)
# Invisible base bar (opacity 0) up to the lower of the two salaries
trace = go.Bar(
    x = selected_countries["lower_values"],
    y = labels,
    width = 0.9,
    marker = dict( color="#fff"),
    opacity=0,
    orientation = "h",
    hoverinfo = "none",
    showlegend=False,
)
fig.add_trace(trace)
# Gap bar: grey when data scientists earn more, orange when analysts do
bar_colors = ["#999" if result==True else palette_orange for result in ds_higher.values]
# Darker colour for one special row -- presumably the "Global" row lands at
# position 19 after sorting; verify if the country selection changes.
bar_colors[19] = "#444"
trace = go.Bar(
    x = np.abs(selected_countries["diff"]),
    y = labels,
    width = 0.85,
    marker = dict( color= bar_colors, line = dict(width=1, color=bar_colors)),
    orientation = "h",
    hoverinfo = "none",
    showlegend=False,
)
fig.add_trace(trace)
# Zero-width dummy bars exist only to create the two legend entries
fig.add_trace(go.Bar(x=[0], y=["Egypt"], marker=dict(color = palette_orange), name="<b>Data analysts</b> are paid more", orientation="h"))
fig.add_trace(go.Bar(x=[0], y=["Egypt"], marker=dict(color = "#999" ), name="<b>Data scientists</b> are paid more", orientation="h"))
text = [
    "<span style='color:%s; font-family:Tahoma; font-size:12px'>average salary</span>" % (annotation_text_color),
    "<span style='color:%s; font-family:Tahoma; font-size:12px'>in USD</span>" % (annotation_text_color)
]
annotation_helper(fig, text, 1.045, 1.11, ref="paper", line_spacing = [0.0275], align="right", width= 200)
text = [
    "<span style='font-size:24px; font-family:Times New Roman;'>Higher salaries in Data Science</span>",
    "<span style='font-size:13px; font-family:Helvetica'> The gap in salaries of data scientists and analysts is greatly</span>",
    "<span style='font-size:13px; font-family:Helvetica'> in favor of data scientists in almost all countries </span>",
]
annotation_helper(fig, text, 1.2, 1.28, [0.0575,0.03],ref="paper", width=500)
fig.show()
It is quite clear that across almost all nations, analysts are paid less than data scientists. It might be possible that a reason for this is that data scientists possess skills in machine learning - a field that is greatly sought after in the job market. We have also seen earlier how lower percentages of data analysts reported that they performed any machine learning related roles.
We look at what data analysts are paid depending on whether or not they possess each of the previous roles.
| Role | with role | without | difference |
|---|---|---|---|
| Analyse data for business decisions | 21844 | 4610 | 17234 |
| Build and manage data infrastructure | 20114 | 16433 | 3681 |
| Build ML prototypes | 29996 | 14396 | 15600 |
| Build ML services for workflows | 24589 | 16922 | 7667 |
| Improve existing ML models | 33490 | 15774 | 17716 |
| Research in ML | 16406 | 18056 | -1650 |
| None of these roles | 7277 | 18559 | -11282 |
| Other | 12665 | 18114 | -5449 |
With data scientists earning on average 50,005 USD in similar conditions, we see how none of these roles by themselves help data analysts reach this level of compensation. Data analysts that perform machine learning related tasks, namely building machine learning prototypes and improving ML models seem to be paid the highest amounts on average. In both cases, simply having these skills almost doubled their average earnings.
| Role | with role | without | difference |
|---|---|---|---|
| Build ML prototypes | 170374 | 109055 | 61319 |
| Improve existing ML models | 167579 | 132222 | 35356 |
Even for data scientists these two roles seem to make a world of difference in terms of their earnings.
This is a question that a lot of individuals entering the field ask, especially those switching careers from non-technical backgrounds. Fortunately for them, a simple google search will tell us that when it comes to data analysis, advanced coding skills aren't always necessary. Basic coding ability to wrangle data and having an understanding of analytics tools like Tableau, Power BI, etc. is often more than sufficient for most data analysis roles.
However when it comes to data science, the public opinion seems to agree that having a good command of coding is essential to make it in this field. Whether it's basic data preparation, analysis, modeling or writing production code, data scientists are bound to have to write code for most of their daily tasks. I will admit that it is a rather odd generalisation that one field doesn't require much coding while in the other it's essential.
# Data prep: distribution of coding experience (Q6) for analysts vs scientists
coding_experience_da = data_21[data_21["Q5"]=="Data Analyst"]["Q6"].value_counts()
coding_experience_da = coding_experience_da / coding_experience_da.sum() # Convert the counts to a percentage of total
coding_experience_ds = data_21[data_21["Q5"]=="Data Scientist"]["Q6"].value_counts()
coding_experience_ds = coding_experience_ds / coding_experience_ds.sum()
# Convert coding experience labels into a shorter, formatted version for use in plotly
label_map = {
    "I have never written code": "Do not<br>code",
    "< 1 years": "< 1<br>years",
    "1-3 years": "1-3<br>years",
    "3-5 years": "3-5<br>years",
    "5-10 years": "5-10<br>years",
    "10-20 years": "10-20<br>years",
    "20+ years": "20+<br>years"
}
# Will be used to determine the order in which the columns are drawn on the graph
ordered_labels = list(label_map.values())
# NOTE(review): this recomputes the shares from the "Data prep" lines above;
# the earlier pair of computations is redundant but kept as-is.
coding_experience_da = data_21[data_21["Q5"]=="Data Analyst"]["Q6"].value_counts()
coding_experience_da = coding_experience_da / coding_experience_da.sum()
labels = [label_map[label] for label in coding_experience_da.index]
coding_experience_da.index = labels
# Creating graph
trace_da = go.Bar(
    y = coding_experience_da,
    x = coding_experience_da.index,
    marker = dict( color = palette_orange, line= dict(color=palette_orange, width=1.5) ),
    opacity = 0.8,
    name = "Data Analyst"
)
coding_experience_ds = data_21[data_21["Q5"]=="Data Scientist"]["Q6"].value_counts()
coding_experience_ds = coding_experience_ds / coding_experience_ds.sum()
labels = [label_map[label] for label in coding_experience_ds.index]
coding_experience_ds.index = labels
# Hatched, transparent overlay so both distributions stay readable on one axis
trace_ds = go.Bar(
    y = coding_experience_ds,
    x = coding_experience_ds.index,
    marker = dict(
        color = "rgba(22,22,22,0)",
        line= dict(color="#222", width=1.5),
        pattern= dict(shape="/", size=7, solidity=0.15)
    ),
    name = "Data Scientist",
)
layout = dict(
    margin = dict(t=100, b=0, pad=5),
    xaxis = dict(
        categoryorder = 'array',
        categoryarray = ordered_labels,
        tickangle = 0,
    ),
    yaxis = dict(
        zeroline = True,
        zerolinecolor = "#4d4d4d",
        zerolinewidth = 1,
        gridcolor = "#ccc",
        tickformat = '.0%',
        dtick = 0.1,
        range=[0,0.45]
    ),
    showlegend = True,
    legend = dict(
        orientation="h",
        yanchor="top",
        y=1.15,
        xanchor="left",
        x= -0.1,
        font=dict(family="Helvetica", size=14, color="rgba(0,0,0,100)"),
        bgcolor = 'rgba(255,255,255,100)',
    ),
    bargap = 0.07,
    width = 600,
    height = 400,
    barmode = "overlay",
    plot_bgcolor = "#fff"
)
data = [trace_da, trace_ds]
fig = go.Figure(data = data, layout = layout)
# Grey band behind the first three columns (the "< 3 years" group)
fig.add_vrect(x0=-0.6, x1=2.52,
    fillcolor="#ddd",
    layer="below",
    opacity= 1,
    line_width=1
)
# ANNOTATIONS
text = [
    "<b style='color:%s; font-family:Tahoma; font-size:13px'>Less than 3 years of coding</b>" % ("#333"),
    "<span style='color:%s; font-family:Tahoma; font-size:11px'>65%% of <b style='color:%s'>data analysts</b> fall in this</span>" % (annotation_text_color, palette_cream),
    "<span style='color:%s; font-family:Tahoma; font-size:11px'>category and 45%% for data scientists. </span>" % (annotation_text_color)
]
annotation_helper(fig, text, 1.15, 0.42, line_spacing = [0.026,0.026,0.021], width= 200 )
text = [
    "<span style='color:%s; font-family:Tahoma; font-size:12px'>More <b>data scientists</b> in</span>" % (annotation_text_color),
    "<span style='color:%s; font-family:Tahoma; font-size:12px'>higher experience ranges</span>" % (annotation_text_color),
    "<span style='color:%s; font-family:Tahoma; font-size:12px'>▼</span>" % (annotation_text_color),
]
annotation_helper(fig, text, 5, 0.25, line_spacing = [0.025,0.025,0.021], width= 150 )
text = [
    "<span style='font-size:24px; font-family:Times New Roman;'>Comparison of coding experience</span>",
    "<span style='font-size:13px; font-family:Helvetica'> Data scientists in general showed more experience with </span>",
    "<span style='font-size:13px; font-family:Helvetica'> writing code in their daily work </span>",
]
annotation_helper(fig, text, 1.05, 1.4, [0.10,0.06],ref="paper", width=500)
iplot(fig)
From the survey data we see that data scientists in general have more experience writing code. That said, the difference between the two isn't as drastic as the many articles would lead you to believe. For example, more data analysts have experience with SQL than data scientists.
With the majority of data professionals having less than three years of coding experience, we also see how they prefer Python and SQL when starting out on their learning journey.
The tasks that a data professional performs on a day to day basis may not always require that they have to write code. Looking at coding experience based on the role performed reveals the following insights.
# Map survey responses to a shorter formatted version
roles_map = {
    'Analyze and understand data to influence product or business decisions' : "Analyze data for<br>business decisions",
    'Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data': "Build data <br>infrastructure",
    'Build prototypes to explore applying machine learning to new areas': "Build ML <br>Prototypes",
    'Build and/or run a machine learning service that operationally improves my product or workflows': "Build ML services <br>for workflows",
    'Experimentation and iteration to improve existing ML models': "Experiment/improve <br>exisiting models",
    'Do research that advances the state of the art of machine learning': "Research to advance <br>ML",
    'None of these activities are an important part of my role at work': "None",
    'Other': "Other"
}
# Restrict to the two professions being compared
dataprofs = data_21[(data_21["Q5"].isin(["Data Scientist", "Data Analyst"]))]
# Q24 is multi-select in the survey: one column per selectable role/activity
questions_roles = [question for question in data_21.columns if "Q24" in question]
# Build a role x coding-experience count table for the heatmap below.
# The fixed bucket list guarantees a stable column set/order even when some
# bucket has no respondents for any role.
experience_buckets = ['I have never written code', '< 1 years', '1-3 years', '3-5 years',
                      '5-10 years', '10-20 years', '20+ years']
#creating table data for learning experience
role_frames = []
for question in questions_roles:
    # Coding-experience distribution of respondents who selected this role
    count_data = dataprofs[dataprofs[question].notnull()]["Q6"].value_counts().reset_index()
    role = dataprofs[question].dropna().unique()[0]  # the answer text identifying the role
    count_data["role"] = role
    temp_df = count_data.pivot(index="role", columns='index', values='Q6')
    temp_df["role"] = role
    role_frames.append(temp_df)
# DataFrame.append is deprecated (removed in pandas 2.0); a single concat gives
# the same column alignment and NaN fill in one pass.
count_df = pd.concat(role_frames, ignore_index=True)
count_df = count_df.set_index("role")
# Row-normalise counts to shares, then fix the column order for display;
# reindex also (re)creates any bucket that is missing entirely.
count_df_perc = np.round(count_df.div(count_df.sum(axis=1).values, axis=0),3)
count_df_perc = count_df_perc.reindex(columns=experience_buckets)
edited_index = [roles_map[index] for index in count_df_perc.index]
count_df_perc.index = edited_index
count_df_perc.columns = [label_map[column] for column in count_df_perc.columns]
# Heatmap of coding-experience share per role
layout = dict(
    margin = dict(t=100, b=0, pad=5),
    xaxis = dict(
        tickangle = 0,
    ),
    yaxis = dict(
        categoryorder = 'array',
        categoryarray = count_df_perc.index.tolist()[::-1],
    ),
    width = 600,
    height = 450,
    coloraxis_showscale = False,
    plot_bgcolor = "#fff"
)
fig = go.Figure(data=go.Heatmap(
    z= count_df_perc*100,
    x= count_df_perc.columns,
    y= count_df_perc.index,
    # custom discrete colour scale (repeated stops create hard colour steps)
    colorscale = [[0, 'white'], [0.25, '#F1E1C2'], [0.25, '#D1B392'], [0.58, '#D1B392'], [0.58, '#B28462'], [0.75, '#B28462'], [0.75, palette_brown], [1, palette_brown]],
    showscale = False,
    ygap = 3,
    xgap = 0.8,
    ),
    layout = layout
)
# Rectangle outlining the cells discussed in the annotation
fig.add_shape(
    type="rect",
    xref="x", yref="y",
    x0=1.45, y0=2.49,
    x1=4.525, y1=5.525,
    line=dict(
        color="#222",
        width=1.5,
    ),
    fillcolor="rgba(0,0,0,0)",
)
# ANNOTATIONS
text = [
    "<span style='font-size:12px; font-family:Helvetica'>◀ more experience </span>",
    "<span style='font-size:12px; font-family:Helvetica'> for the ML </span>",
    "<span style='font-size:12px; font-family:Helvetica'> related roles </span>",
]
annotation_helper(fig, text, 5.52, 5, [0.35,0.35], bgcolor="rgba(255,255,255,0.65)", width=103)
text = [
    "<span style='font-size:26px; font-family:Times New Roman;'>Coding experience for different roles</span>",
    "<span style='font-size:13px; font-family:Helvetica'> The chart shows the the <b>percentage of respondents at a particular</b> </span>",
    "<span style='font-size:13px; font-family:Helvetica'> <b>coding experience level</b> for each role.This helps us understand</span>",
    "<span style='font-size:13px; font-family:Helvetica'> the level of coding needed to perform a role.</span>",
]
annotation_helper(fig, text, 1.05, 1.34, [0.12,0.05, 0.05],ref="paper", width=500)
fig.show()
C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\2440585364.py:30: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\2440585364.py:30: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\2440585364.py:30: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\2440585364.py:30: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\2440585364.py:30: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\2440585364.py:30: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\2440585364.py:30: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\2440585364.py:30: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
A few key takeaways from the above are:
This all falls in line with how the data analysis fields deal with coding, to many it is simply a means to an end. They need to be able to surf through vast datasets at blazing speeds, analyse and recognise patterns and trends. This quick generation of insights may often result in sloppy or ad-hoc coding practices, which is rarely expected to be pushed to production code.
# Shorter two-line labels for the Q15 (ML experience) buckets, used by plotly
label_map = {
    "I do not use machine learning methods": "Do not<br>use ML",
    "Under 1 year": "< 1<br>year",
    "1-2 years": "1-2<br>years",
    "2-3 years": "2-3<br>years",
    "3-4 years": "3-4<br>years",
    "4-5 years": "4-5<br>years",
    "5-10 years": "5-10<br>years",
    "10-20 years": "10-20<br>years",
    "20 or more years": "20+<br>years"
}
# Drawing order of the x-axis categories
ordered_labels = list(label_map.values())
# Share of data analysts in each ML-experience bucket
ml_experience_da = data_21[data_21["Q5"]=="Data Analyst"]["Q15"].value_counts()
ml_experience_da = ml_experience_da / ml_experience_da.sum()
labels = [label_map[label] for label in ml_experience_da.index]
ml_experience_da.index = labels
# Creating graph
trace_da = go.Bar(
    y = ml_experience_da,
    x = ml_experience_da.index,
    marker = dict( color = palette_cream, line= dict(color=palette_orange, width=1.5) ),
    opacity = 1,
    name = "Data Analyst"
)
# Share of data scientists in each ML-experience bucket
ml_experience_ds = data_21[data_21["Q5"]=="Data Scientist"]["Q15"].value_counts()
ml_experience_ds = ml_experience_ds / ml_experience_ds.sum()
labels = [label_map[label] for label in ml_experience_ds.index]
ml_experience_ds.index = labels
# Hatched, transparent overlay bar so both distributions share one axis
trace_ds = go.Bar(
    y = ml_experience_ds,
    x = ml_experience_ds.index,
    marker = dict(
        color = "rgba(22,22,22,0)",
        line= dict(color="#222", width=1.5),
        pattern= dict(shape="/", size=7, solidity=0.15)
    ),
    name = "Data Scientist",
)
layout = dict(
    margin = dict(t=75, b=0, pad=5),
    xaxis = dict(
        categoryorder = 'array',
        categoryarray = ordered_labels,
        tickangle = 0,
    ),
    yaxis = dict(
        zeroline = True,
        zerolinecolor = "#4d4d4d",
        zerolinewidth = 1,
        gridcolor = "#ccc",
        tickformat = '.0%',
        dtick = 0.1,
        range=[0,0.45]
    ),
    showlegend = True,
    legend = dict(
        orientation="h",
        yanchor="top",
        y=1.07,
        xanchor="left",
        x= -0.1,
        font=dict(family="Helvetica", size=13, color="rgba(0,0,0,100)"),
        bgcolor = 'rgba(255,255,255,100)',
    ),
    bargap = 0.07,
    width = 600,
    height = 350,
    barmode = "overlay",
    plot_bgcolor = "#fff"
)
data = [trace_da, trace_ds]
fig = go.Figure(data = data, layout = layout)
# ANNOTATIONS
text = [
    "<span style='color:%s; font-family:Tahoma; font-size:12px'>More <b>data scientists</b> in</span>" % (annotation_text_color),
    "<span style='color:%s; font-family:Tahoma; font-size:12px'>higher experience ranges</span>" % (annotation_text_color),
]
annotation_helper(fig, text, 6.0, 0.17, line_spacing = [0.025,0.021,0.021], width= 150 )
text = [
    "<span style='font-size:24px; font-family:Times New Roman;'>Comparison of Machine Learning experience</span>",
    "<span style='font-size:13px; font-family:Helvetica'> Not surprisingly, data scientists showed greater experience </span>",
    "<span style='font-size:13px; font-family:Helvetica'> with using machine learning on a regular basis. </span>",
]
annotation_helper(fig, text, 1.05, 1.34, [0.12,0.06],ref="paper", width=500)
iplot(fig)
In general, machine learning tasks are often out of the scope of a data analyst’s work, resulting in them having far less hands-on experience with machine learning models as compared to their data science counterparts. However when we zoom out and look at machine learning experience in all data fields, it is surprising how low analysts rank overall.
# Map the Q15 ML-experience buckets onto numeric midpoints (in years) so we
# can average them; "20 or more years" is pinned at a representative 25.
q15_map = {
    "I do not use machine learning methods" : 0,
    "Under 1 year" : 0.5,
    "1-2 years" : 1.5,
    "2-3 years" : 2.5,
    "3-4 years" : 3.5,
    "4-5 years" : 4.5,
    "5-10 years" : 7.5,
    "10-20 years" : 15,
    "20 or more years" : 25,
}
# Series.map leaves unmapped values (non-respondents' NaN) as NaN, so the dict
# no longer needs an np.NaN key. The previous apply()-based dict lookup only
# worked because pandas' missing value happened to be the np.nan singleton --
# it would raise KeyError for any other NaN object.
data_21["Q15_num"] = data_21["Q15"].map(q15_map)
fig = go.Figure()
# Mean numeric ML experience per profession (students/unemployed/other excluded)
ml_experience = data_21[~(data_21["Q5"].isin(["Student", "Currently not employed", "Other"]))]
ml_experience = ml_experience.groupby("Q5")["Q15_num"].mean()
ml_experience = ml_experience.sort_values(ascending=True)
labels = ml_experience.index.tolist()
# Emphasise the three roles discussed in the narrative.
# NOTE(review): positions 0, 1 and 9 assume a fixed ordering of the sorted
# means -- verify against the data if the survey year changes.
labels[0] = "<b style='color:%s'>Business Analyst</b>" % palette_orange
labels[1] = "<b style='color:%s'>Data Analyst</b>" % palette_orange
labels[9] = "<b style='color:%s'>Data Scientist</b>" % palette_orange
layout = dict(
    showlegend = True,
    legend = dict(
        orientation="h",
        traceorder="reversed",
        yanchor="top",
        y=1.12,
        font=dict(family="Helvetica", size=14, color="rgba(0,0,0,100)"),
        bgcolor = 'rgba(255,255,255,100)',
        xanchor="left",
        x= -0.06,
    ),
    margin = dict(t=100, l=200, b=0, pad=6),
    plot_bgcolor= '#fff',
    xaxis = dict(
        gridcolor = "#ccc",
        dtick = 1,
    ),
    # yaxis = dict(categoryorder='array', categoryarray = labels[::-1] ),
    barmode= "overlay",
    height = 550,
    width = 600
)
fig.update_layout(layout)
trace = go.Bar(
    x = ml_experience.values,
    y = labels,
    width = 0.9,
    # Orange for the highlighted rows, grey otherwise; the colour list order
    # mirrors the positional assumption flagged above.
    marker = dict( color=[palette_orange] * 2 + ["#999"] * 7 + [palette_orange] + ["#999"] * 2),
    texttemplate = [" <span style='color: #fff'>%{x:.2f}</span> "] * 13 ,
    textposition = "inside",
    insidetextanchor="start",
    orientation = "h",
    hoverinfo = "none",
    showlegend=False,
)
fig.add_trace(trace)
# ANNOTATIONS
text = [
    "<b style='color:%s; font-family:Tahoma; font-size:13px'>Average years of ML experience</b>" % (annotation_text_color),
]
annotation_helper(fig, text, -0.016, 1.055, ref="paper", line_spacing = [0.032,0.032], align="left", width= 250)
text = [
    "<span style='font-size:24px; font-family:Times New Roman;'>Machine Learning in data roles</span>",
    "<span style='font-size:13px; font-family:Helvetica'>On average, <b style='color:%s'>analysts have the lowest machine learning experience</b></span>" % (palette_orange),
    "<span style='font-size:13px; font-family:Helvetica'>among all the groups mentioned in the response data. </span>",
]
annotation_helper(fig, text, 1.03, 1.23, [0.072,0.039,0.055],ref="paper", width=500)
fig.show()
In the following chart we see how:
# Shared two-line labels for the coding-experience buckets used on both axes
label_map = {
    "< 1 years": "< 1<br>years",
    "1-3 years": "1-3<br>years",
    "3-5 years": "3-5<br>years",
    "5-10 years": "5-10<br>years",
    "10-20 years": "10-20<br>years",
    "20+ years": "20+<br>years"
}
ordered_labels = [ "< 1<br>years", "1-3<br>years", "3-5<br>years", "5-10<br>years", "10-20<br>years", "20+<br>years" ]
dataprof_list = ["Data Scientist", "Program/Project Manager", "Data Analyst"]
# One heatmap subplot per profession, stacked vertically
fig = subplots.make_subplots(
    rows=3,
    cols=1,
    vertical_spacing = 0.08,
    subplot_titles = dataprof_list
)
layout = dict(
    margin = dict(t=100, b=0, pad=5),
    xaxis = dict( tickangle = 0, categoryorder = 'array', categoryarray = ordered_labels ),
    xaxis2 = dict( tickangle = 0, categoryorder = 'array', categoryarray = ordered_labels ),
    xaxis3 = dict( tickangle = 0, categoryorder = 'array', categoryarray = ordered_labels ),
    yaxis = dict( categoryorder = 'array', categoryarray = ordered_labels ),
    yaxis2 = dict( categoryorder = 'array', categoryarray = ordered_labels ),
    yaxis3 = dict( categoryorder = 'array', categoryarray = ordered_labels ),
    width = 500,
    height = 1200,
    coloraxis_showscale = False,
    plot_bgcolor = "#fff"
)
for index, dataprof in enumerate(dataprof_list):
    dataprofs = data_21[data_21["Q5"] == dataprof]
    # Counts of (coding experience Q6, ML experience Q15) pairs
    experience = pd.DataFrame(dataprofs.groupby("Q6")["Q15"].value_counts())
    experience.columns = ["count"]
    experience = experience.reset_index()
    experience = experience.pivot(index="Q6", columns="Q15", values='count')
    # Collapse the finer Q15 buckets onto the coarser Q6 bucket boundaries so
    # the two axes share the same six categories
    experience["< 1 years"] = experience["I do not use machine learning methods"] + experience["Under 1 year"]
    experience["1-3 years"] = experience["1-2 years"] + experience["2-3 years"]
    experience["3-5 years"] = experience["3-4 years"] + experience["4-5 years"]
    experience["20+ years"] = experience["20 or more years"]
    # Keep only the columns named like the index (the six shared buckets).
    # NOTE(review): assumes every index value also exists as a column --
    # verify if the survey categories ever change.
    experience = experience[experience.index]
    experience = experience.T
    experience.columns = [label_map[column] for column in experience.columns]
    experience.index = [label_map[column] for column in experience.index]
    trace =go.Heatmap(
        z= experience,
        x= experience.columns,
        y= experience.index,
        # custom discrete colour scale (repeated stops = hard colour steps)
        colorscale = [[0, 'white'], [0.15, '#F1E1C2'], [0.25, '#D1B392'], [0.58, '#D1B392'], [0.58, '#B28462'], [0.75, '#B28462'], [0.75, palette_brown], [1, palette_brown]],
        # colorscale = [[0, 'white'], [1, palette_brown]],
        showscale = False,
        ygap = 1,
        xgap = 1,
    )
    fig.add_trace(trace, index+1, 1)
    # Plotly numbers subplot axes "x", "x2", "x3" -- the first has no suffix
    domain_number = str(index + 1) if index!=0 else ""
    fig.add_shape(type="rect",
        xref="x" + domain_number + " domain", yref="y" + domain_number + " domain",
        x0=0, y0=0,
        x1=1, y1=1,
        line=dict(
            color="#999",
            width=0.5,
        ),
        fillcolor="rgba(0,0,0,0)",
    )
# ANNOTATIONS (positioned in the data coordinates of individual subplots)
text = [
    "<span style='font-size:12px; font-family:Helvetica; color: %s'> Diagonals show those that </span>" % (annotation_text_color),
    "<span style='font-size:12px; font-family:Helvetica; color: %s'> learnt ML methods around</span>" % (annotation_text_color),
    "<span style='font-size:12px; font-family:Helvetica; color: %s'> the same time that they </span>" % (annotation_text_color),
    "<span style='font-size:12px; font-family:Helvetica; color: %s'> started coding. ▼</span>" % (annotation_text_color),
]
annotation_helper(fig, text, 1, 3.55, [0.28,0.28,0.28], xref="x", yref="y", width=150)
text = [
    "<span style='font-size:12px; font-family:Helvetica; color: %s'> Below the diagonal, we find their coding </span>" % (annotation_text_color),
    "<span style='font-size:12px; font-family:Helvetica; color: %s'> experience <b>exceeds their ML expertise</b></span>" % (annotation_text_color)
]
annotation_helper(fig, text, 3, 1.16, [0.28,0.28], xref="x2", yref="y2", bgcolor="rgba(255,255,255,0.6)", width=225)
# Data analysis profs have more coding experience than they have experience with machine learning
text = [
    "<span style='font-size:12px; font-family:Helvetica; color: %s'> In general, data analysis professionals </span>" % (annotation_text_color),
    "<span style='font-size:12px; font-family:Helvetica; color: %s'> have <b>more experience coding</b> than</span>" % (annotation_text_color),
    "<span style='font-size:12px; font-family:Helvetica; color: %s'> they have practice with ML methods </span>" % (annotation_text_color),
    "<span style='font-size:12px; font-family:Helvetica; color: %s'> ▼</span>" % (annotation_text_color),
]
annotation_helper(fig, text, 3, 2.6, [0.28,0.28,0.32,0], xref="x3", yref="y3", bgcolor="rgba(255,255,255,0.6)", width=225)
# Shared axis captions, repeated on each subplot
text = [
    "<span style='font-size:13px; font-family:Helvetica; color: %s'> Machine Learning </span>" % (annotation_text_color),
    "<span style='font-size:13px; font-family:Helvetica; color: %s'> Experience</span>" % (annotation_text_color),
]
annotation_helper(fig, text, 0.5, 5.17, [0.3,0.35], xref="x", yref="y", width=110)
annotation_helper(fig, text, 0.5, 5.17, [0.3,0.35], xref="x2", yref="y2", width=110)
annotation_helper(fig, text, 0.5, 5.17, [0.3,0.35], xref="x3", yref="y3", width=110)
text = [
    "<span style='font-size:13px; font-family:Helvetica; color: %s'> Coding Experience </span>" % ("#222"),
]
annotation_helper(fig, text, 4.35, -0.1, [0.3,0.35], xref="x" , yref="y" , align="right", width=120)
annotation_helper(fig, text, 4.35, -0.1, [0.3,0.35], xref="x2", yref="y2", align="right", width=120)
annotation_helper(fig, text, 4.35, -0.1, [0.3,0.35], xref="x3", yref="y3", align="right", width=120)
fig.update_layout(layout)
text = [
    "<span style='font-size:24px; font-family:Times New Roman;'>Coding experience for different roles</span>",
    "<span style='font-size:13px; font-family:Helvetica'> The chart shows the the percentage of respondents at a particular level of coding </span>",
    "<span style='font-size:13px; font-family:Helvetica'> experience for each role. </span>",
]
annotation_helper(fig, text, 1.3, 1.3, [0.10,0.06],ref="paper", width=500)
text = [
    "<span style='font-size:24px; font-family:Times New Roman;'>Machine learning vs coding experience</span>",
    "<span style='font-size:13px; font-family:Helvetica'>The following chart shows how coding experience and machine",
    "<span style='font-size:13px; font-family:Helvetica'>learning knowledge are related in various fields. </span>",
]
annotation_helper(fig, text, 1.37, 1.095, [0.027,0.015,0.055], ref="paper", width=500)
fig.show()
Additionally, you can use the following piece of code to see how this plays out for the other data-related professions.
# Collapse the fine-grained ML-experience answers (Q15) into the same buckets
# used for coding experience (Q6) so the two can share heatmap axes.
ml_map = {
    'I do not use machine learning methods': '< 1 years',
    'Under 1 year': '< 1 years' ,
    '1-2 years': '1-3 years',
    '2-3 years': '1-3 years',
    '3-4 years': '3-5 years',
    '4-5 years': '3-5 years',
    '5-10 years': '5-10 years',
    '10-20 years':'10-20 years',
    '20 or more years': '20+ years'
}
roles = data_21["Q5"].unique()
# Build one coding-vs-ML count frame per role and concatenate once at the end.
# (DataFrame.append is deprecated — the FutureWarnings this cell used to emit —
# pandas.concat is the replacement.)
frames = []
for role in roles:
    dataprofs = data_21[data_21["Q5"] == role]
    # Naming the counts Series avoids the reset_index() name collision with the
    # "Q15" index level — the reason the original needed a bare `.rename()`.
    heatmap_counts = dataprofs.groupby("Q6")["Q15"].value_counts().rename("count").reset_index()
    heatmap_counts["field"] = role
    frames.append(heatmap_counts)
ml_coding_heatmap_counts = pd.concat(frames, ignore_index=True)
ml_coding_heatmap_counts.columns = ["coding","ml","count","field"]
ml_coding_heatmap_counts["ml"] = ml_coding_heatmap_counts["ml"].apply(lambda x: ml_map[x])
# Re-aggregate because several raw ML buckets map onto the same coarse bucket
ml_coding_heatmap_counts = ml_coding_heatmap_counts.groupby(["field","coding","ml"])["count"].sum().reset_index()
# The D3 visualisation below reads this CSV
ml_coding_heatmap_counts.to_csv("ml_coding_heatmap_counts.csv", index=False)
# HTML scaffold for the interactive D3 heatmap: a <select> for the role, a
# "Show counts" checkbox, and the #dataviz_ml_coding container the D3 code
# draws into. The hidden #baseimg image exists only so the JS can recover the
# notebook's base URL from its resolved src attribute.
htmlt1 = '''
<head>
<style>
.dv_ml_coding {
line-height: initial !important;
border: 1px solid #d7d7d7;
width: 530px;
padding: 10px 7px 7px;
box-shadow: 2px 2px 2px 1px rgba(0, 0, 0, 0.3);
}
.ml_coding_controls {
padding-left: 30px;
margin-bottom: 5px;
}
input.ml_coding_count_check {
font-family: Helvetica, sans-serif;
border-radius: none;
font-size: 14px;
padding: 5px 10px;
background-color: #fff;
border: 1px solid #d7d7d7;
margin-bottom: 10px;
}
</style>
</head>
<body>
<img id="baseimg" src="img.png" style="display:none" />
<!-- Create a div where the graph will take place -->
<div class="dv_ml_coding">
<div class="ml_coding_controls">
<label for="field"> Role: </label>
<select name="field" class="ml_coding_field_select">
<option value = 'Business Analyst'> Business Analyst </option>
<option value = 'Currently not employed'> Currently not employed </option>
<option value = 'DBA/Database Engineer'> DBA/Database Engineer </option>
<option value = 'Data Analyst'> Data Analyst </option>
<option value = 'Data Engineer'> Data Engineer </option>
<option value = 'Data Scientist' selected> Data Scientist </option>
<option value = 'Developer Relations/Advocacy'> Developer Relations/Advocacy </option>
<option value = 'Machine Learning Engineer'> Machine Learning Engineer </option>
<option value = 'Other'> Other </option>
<option value = 'Product Manager'> Product Manager </option>
<option value = 'Program/Project Manager'> Program/Project Manager </option>
<option value = 'Research Scientist'> Research Scientist </option>
<option value = 'Software Engineer'> Software Engineer </option>
<option value = 'Statistician'> Statistician </option>
<option value = 'Student'> Student </option>
</select>
<input type="checkbox" name="show_counts" class="ml_coding_count_check"> Show counts </input>
</div>
<div class="ml_coding_title"> Title goes here </div>
<div class="ml_coding_desc"> Desc goes here </div>
<div id="dataviz_ml_coding"></div>
</div>
</body>
'''
# D3 code for the interactive ML-vs-coding heatmap. It reads the CSV written
# by the cell above (resolved relative to the notebook via the hidden
# #baseimg element) and redraws on role-select / checkbox changes.
# Bug fix: the checkbox "change" handler read `ml_coding_select.value`, but a
# d3 selection has no .value property (it is undefined), so toggling the
# checkbox always redrew the default "Data Scientist" role. The value must be
# read off the underlying DOM node via `.node().value`.
# NOTE(review): the inner getValue() helper references an undefined `mode` —
# it appears to be dead code (never called); confirm before removing.
js_t1 = '''
require.config({
paths: {
d3: "https://d3js.org/d3.v4.min"
}
});
require(["d3"], function(d3) {
var baseUrl = document.getElementById('baseimg').src.replace(/img.png.*$/, '')
var coding_median_map = {
"Business Analyst":"1-3 years",
"Currently not employed":"1-3 years",
"DBA/Database Engineer":"5-10 years",
"Data Analyst":"1-3 years",
"Data Engineer":"3-5 years",
"Data Scientist":"3-5 years",
"Developer Relations/Advocacy":"3-5 years",
"Machine Learning Engineer":"3-5 years",
"Other":"1-3 years",
"Product Manager":"3-5 years",
"Program/Project Manager":"5-10 years",
"Research Scientist":"5-10 years",
"Software Engineer":"3-5 years",
"Statistician":"1-3 years",
"Student":"1-3 years"
}
var ml_median_map = {
"Business Analyst":"Under 1 year",
"Currently not employed":"Under 1 year",
"DBA/Database Engineer":"Under 1 year",
"Data Analyst":"Under 1 year",
"Data Engineer":"1-2 years",
"Data Scientist":"1-2 years",
"Developer Relations/Advocacy":"Under 1 year",
"Machine Learning Engineer":"1-2 years",
"Other":"Under 1 year",
"Product Manager":"1-2 years",
"Program/Project Manager":"Under 1 year",
"Research Scientist":"2-3 years",
"Software Engineer":"Under 1 year",
"Statistician":"1-2 years",
"Student":"Under 1 year",
}
var count_map = {
"Business Analyst": 811,
"Currently not employed": 1871,
"DBA/Database Engineer": 154,
"Data Analyst": 2094,
"Data Engineer": 640,
"Data Scientist": 3455,
"Developer Relations/Advocacy": 88,
"Machine Learning Engineer": 1403,
"Other": 2030,
"Product Manager": 269,
"Program/Project Manager": 767,
"Research Scientist": 1421,
"Software Engineer": 2325,
"Statistician": 281,
"Student": 6145,
}
function createDesc(field){
var desc = "We see how the machine learning experience of a " + field + " varies with the number of years that they have been coding.<br>"
desc += "<br><b style='font-size:13px'>Number of respondents</b>: " + count_map[field]
desc += "<br><b style='font-size:13px'>Median coding experience</b>: " + coding_median_map[field]
desc += "<br><b style='font-size:13px'>Median machine learning experience</b>: " + ml_median_map[field]
return desc
}
// set the dimensions and margins of the graph
var margin_ml_coding = { top: 25, right: 0, bottom: 40, left: 60 },
width_ml_coding = 500 - margin_ml_coding.left - margin_ml_coding.right,
height_ml_coding = 450 - margin_ml_coding.top - margin_ml_coding.bottom;
// append the svg object to the body of the page
var svg = d3.select("#dataviz_ml_coding")
.append("svg")
.attr("width", width_ml_coding + margin_ml_coding.left + margin_ml_coding.right)
.attr("height", height_ml_coding + margin_ml_coding.top + margin_ml_coding.bottom)
.append("g")
.attr("transform",
"translate(" + margin_ml_coding.left + "," + margin_ml_coding.top + ")");
// Labels of row and columns
var xlabels = [ '< 1 years', '1-3 years', '3-5 years', '5-10 years', '10-20 years', '20+ years' ]
var ylabels = [ '< 1 years', '1-3 years', '3-5 years', '5-10 years', '10-20 years', '20+ years' ]
// Build X scales and axis:
var x = d3.scaleBand()
.range([0, width_ml_coding])
.domain(xlabels)
.padding(0.01);
svg.append("g")
.attr("class","xaxis")
.attr("transform", "translate(0," + height_ml_coding + ")")
.call(d3.axisBottom(x))
svg.select(".xaxis")
.call( g => g.append("text")
.attr("x", 375)
.attr("y", 35)
.attr("fill","#4d4d4d")
.style("font-size","14px")
.style("font-weight","bold")
.text("Coding experience"));
// Build X scales and axis:
var y = d3.scaleBand()
.range([height_ml_coding, 0])
.domain(ylabels)
.padding(0.03);
svg.append("g")
.attr("class","yaxis")
.call(d3.axisLeft(y));
svg.select(".yaxis")
.call( g => g.append("text")
.attr("x", 130)
.attr("y", 15)
.attr("fill","#4d4d4d")
.style("font-size","14px")
.style("font-weight","bold")
.text("Machine Learning")).raise();
svg.select(".yaxis")
.call( g => g.append("text")
.attr("x", 86)
.attr("y", 30)
.attr("fill","#4d4d4d")
.style("font-size","14px")
.style("font-weight","bold")
.text("Experience"));
// Build color scale
var myColor = d3.scaleLinear()
.range(["#f5f5f5", "#b2481b"])
.domain([1, 216])
showData("Data Scientist");
let ml_coding_checkbox = d3.select(".ml_coding_count_check")
let ml_coding_select = d3.select(".ml_coding_field_select")
ml_coding_checkbox
.on("change", function(){
// .node() gives the DOM element; a d3 selection has no .value property
showData( ml_coding_select.node().value )
})
ml_coding_select
.on("change", function(){
showData( this.value )
})
function showData( field="Data Scientist" ){
d3.csv(baseUrl + "ml_coding_heatmap_counts.csv", function (data) {
var show_counts = d3.selectAll(".ml_coding_count_check").node().checked == true ? 1: 0;
var field_data = data.filter( d => d.field == field)
function getValue(d){
if(mode == "count") return +d.count
else if(mode == "salary") return +d.sal
else return +d.prop
}
d3.select(".ml_coding_title")
.text(field)
.style("position", "relative")
.style("left", "10px")
.style("font-family", "Times New Roman")
.style("font-size", "30px")
d3.select(".ml_coding_desc")
.html(createDesc(field))
.style("padding-top", "5px")
.style("width", "500px")
.style("position", "relative")
.style("left", "10px")
.style("font-family", "Tahoma")
.style("font-size", "14px")
.style("color", "#4d4d4d")
var maxColor = d3.max(field_data, (d)=>{return +d.count })
var minColor = d3.min(field_data, (d)=>{return +d.count })
var myColor = d3.scaleLinear()
.range(["#f5f5f5", "#925632"])
.domain([minColor, maxColor])
var u = svg.selectAll("rect")
.data(field_data, function(d) {return d.ml+' '+d.coding;})
u
.enter()
.append("rect")
.style("fill", function (d) { return myColor(0); })
.style("opacity", function (d) { return 0 })
.merge(u)
.attr("x", function (d) { return x(d.coding)})
.attr("y", function (d) { return y(d.ml) })
.attr("width", x.bandwidth())
.attr("height", y.bandwidth())
.transition()
.duration(1000)
.style("fill", function (d) { if(+d.count !=="nan"){return myColor(+d.count);} return myColor(0); })
.style("opacity", function (d) { return 1 })
var v = svg.selectAll("svg > g > text")
.data(field_data, function(d) {return d.ml+' '+d.coding;})
v
.enter()
.append("text")
.style("opacity", function(d){ return 0; })
.attr("x", function (d) { return x(d.coding)})
.attr("y", function (d) { return y(d.ml) })
.attr("text-anchor","middle")
.attr("transform","translate(45,42)")
.style("font-family","Didot")
.style("font-size","24px")
.merge(v)
.transition()
.duration(1000)
.text( function (d) {return d.count; })
.style("opacity", function(d){ return show_counts; })
.style("fill", function (d){if(d.count > maxColor/2) { return "#f5f5f5"}; return "#222";})
v.exit().remove();
svg.select(".yaxis").raise();
})
}
});
'''
# Render the widget: inject the HTML scaffold, then execute the D3 code
h = display(HTML(htmlt1))
j = IPython.display.Javascript(js_t1)
IPython.display.display_javascript(j)
C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\4236159475.py:24: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\4236159475.py:24: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\4236159475.py:24: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\4236159475.py:24: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\4236159475.py:24: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\4236159475.py:24: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\4236159475.py:24: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\4236159475.py:24: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\4236159475.py:24: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\4236159475.py:24: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. 
C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\4236159475.py:24: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\4236159475.py:24: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\4236159475.py:24: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\4236159475.py:24: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Vilma\AppData\Local\Temp\ipykernel_7540\4236159475.py:24: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
# Short display labels for the survey's long Q4 education answers
# (<br> produces line breaks in the chart's y-axis tick labels)
label_map = {
    "No formal education past high school": "High school",
    "Some college/university study without earning a bachelor’s degree": "College/University<br>without Bachelor's<br>degree",
    "Bachelor’s degree": "Bachelor’s degree",
    "Master’s degree": "Master’s degree",
    "Doctorate-level degree": "Doctorate-level<br>degree"
}
# Chart display order follows the label_map definition order
ordered_labels = list(label_map.values())

# Share of Data Analyst respondents per education level, with the two
# doctorate answers merged into a single bucket.
education_da = data_21.loc[data_21["Q5"] == "Data Analyst", "Q4"].value_counts()
education_da["Doctorate-level degree"] = (
    education_da["Doctoral degree"] + education_da["Professional doctorate"]
)
education_da = education_da.drop(
    ["Doctoral degree", "Professional doctorate", "I prefer not to answer"]
)
# Convert raw counts to shares
education_da = education_da / education_da.sum()
# Swap in the short display labels
labels = [label_map[label] for label in education_da.index]
education_da.index = labels

# Horizontal bar trace for Data Analysts
trace_da = go.Bar(
    x=education_da,
    y=education_da.index,
    orientation="h",
    marker=dict(color=palette_cream, line=dict(color=palette_orange, width=1.5)),
    opacity=1,
    name="Data Analyst",
)
# Share of Data Scientist respondents per education level, processed the same
# way as the Data Analyst series.
education_ds = data_21.loc[data_21["Q5"] == "Data Scientist", "Q4"].value_counts()
education_ds["Doctorate-level degree"] = (
    education_ds["Doctoral degree"] + education_ds["Professional doctorate"]
)
education_ds = education_ds.drop(
    ["Doctoral degree", "Professional doctorate", "I prefer not to answer"]
)
education_ds = education_ds / education_ds.sum()
labels = [label_map[label] for label in education_ds.index]
education_ds.index = labels

# White hatched bars with a dark outline so the two roles stay distinguishable
trace_ds = go.Bar(
    x=education_ds,
    y=education_ds.index,
    orientation="h",
    marker=dict(
        color="rgba(255,255,255,255)",
        line=dict(color="#222", width=1.5),
        pattern=dict(shape="/", size=7, solidity=0.15),
    ),
    name="Data Scientist",
)
# Layout for the grouped horizontal bar chart comparing the two roles
layout = dict(
    margin = dict(t=100, b=0, pad=5),
    xaxis = dict(
        tickformat = ".0%",  # values are shares, render as percentages
        zeroline = True,
        zerolinecolor = "#4d4d4d",
        zerolinewidth = 1,
        gridcolor = "#ccc"
    ),
    yaxis = dict(
        # Force both traces onto the same fixed category ordering
        categoryorder = 'array',
        categoryarray = ordered_labels,
    ),
    showlegend = True,
    legend = dict(
        orientation="h",
        yanchor="top",
        y=1.1,
        xanchor="left",
        x= -0.2,
        font=dict(family="Helvetica", size=13, color="rgba(0,0,0,100)"),
        bgcolor = 'rgba(255,255,255,0)',
    ),
    width = 450,
    height = 450,
    barmode = "group",
    bargroupgap = 0.1,
    bargap = .1,
    plot_bgcolor = "#fff"
)
data = [trace_da, trace_ds]
fig = go.Figure(data = data, layout = layout)
# Title + subtitle rendered as annotations (annotation_helper from an earlier cell)
text = [
    "<span style='font-size:24px; font-family:Times New Roman;'>Education qualifications of the Fields</span>",
    "<span style='font-size:13px; font-family:Helvetica'> Data scientists show a higher percentage of respondents with</span>",
    "<span style='font-size:13px; font-family:Helvetica'> doctorate level degrees. </span>",
]
annotation_helper(fig, text, 1.6, 1.32, [0.09,0.05],ref="paper", width=500)
iplot(fig)
With more than 60% of candidates having a master's degree or higher level of education, it becomes apparent that having at least a master's degree is a common route for a data scientist. Data analysts show a higher percentage of individuals that entered the field with a bachelor’s degree at the expense of far fewer pursuing doctorate-level programs.
A deep-dive into requirements on job postings for Facebook/Meta highlights another point of difference between data analysts and scientists - for data scientists, having a PhD is not only common, but is often preferred.
"""
This cell contains data and code used with permission from the notebook:
"What you need to land that data job at Meta/Facebook" by Edouard Godfrey
link: https://www.reproducible-hq.com/notebook/land-that-data-job-at-meta.html
Definitely worth a read to get a deeper look into what Meta looks for in its data professionals.
"""
import pandas
jobposts = pandas.read_json("./meta_job_listings/meta_careers_dataset.json")
# Filter out post where the title could not be extracted
jobposts = jobposts[jobposts.title.apply(lambda t: isinstance(t, str))]
jobposts = jobposts.reset_index(drop=True)
# Split the sections column
def items(sections, col):
    """Return the bullet items of the first section whose title contains
    `col`, or an empty list when no section matches."""
    matching = (section['items'] for section in sections if col in section['title'])
    return next(matching, [])
# Expand the nested `sections` blobs into one list-valued column per section of interest
for col in ['Responsibilities', 'Minimum Qualifications', 'Preferred Qualifications']:
    jobposts[col] = jobposts.sections.apply(lambda s: items(s, col))
# Helper function to tokenize text according to a list of delimiters
import re
def tokenize(txt, delimiters):
    """Split `txt` on any of `delimiters` (regex-escaped, so characters like
    '.' and '(' are treated literally)."""
    escaped = (re.escape(delim) for delim in delimiters)
    return re.split('|'.join(escaped), txt)
# Create the languages column
languages = ['Julia', 'MATLAB', 'SAS', 'R', 'SQL', 'Python', 'Java', 'C++', 'C', 'C#', 'PHP']
def row_to_lg(row):
    """List every known programming language mentioned in a posting's
    qualification/responsibility sections (duplicates kept)."""
    section_names = ('Minimum Qualifications', 'Responsibilities', 'Preferred Qualifications')
    txt = '\n'.join(line for name in section_names for line in row.get(name, []))
    delimiters = [' ', '(', ')', '/', '\n', '.', ',', ';', '"', "'", "’", '{', '}', '[', ']']
    return [token for token in tokenize(txt, delimiters) if token in languages]
jobposts['languages'] = jobposts.apply(lambda r: row_to_lg(r), axis=1)
# One boolean column per language, marking whether the posting mentions it
for lg in languages:
    jobposts[lg] = jobposts['languages'].apply(lambda ls: lg in ls)
# Create the education column
educations = ['PhD', 'Master', 'Bachelor']
def row_to_education(row):
    """Return the highest education level the posting's qualification sections
    mention, as a one-element list, or [] when none is found."""
    content = row.get('Minimum Qualifications', []) + row.get('Preferred Qualifications', [])
    delimiters = [' ', '(', ')', '/', '\n', ',', ';', '"', "'", "’", '{', '}', '[', ']']
    words = set(tokenize('\n'.join(content), delimiters))
    if words & {'PhD', 'Ph.D.', 'Doctorate'}:
        return ['PhD']
    if words & {'M.S.', 'Masters', 'MS', 'Master', 'MSc'}:
        return ['Master']
    if words & {'B.S.', 'Bachelors', 'BS', 'Bachelor', 'BSc', 'BSc.', 'Degree'}:
        return ['Bachelor']
    return []
jobposts['education'] = jobposts.apply(lambda r: row_to_education(r), axis=1)
# One boolean column per education level
for ed in educations:
    jobposts[ed] = jobposts['education'].apply(lambda eds: ed in eds)
# Create the packages column
packages = ['pandas', 'numpy', 'theano', 'matplotlib', 'scipy', 'scikit-learn', 'pytorch', 'tensorflow', 'keras', 'luigi', 'dagster']
def row_to_py_packages(row):
    """List every known Python package mentioned (case-insensitively) in a
    posting's qualification sections (duplicates kept)."""
    content = row.get('Minimum Qualifications', []) + row.get('Preferred Qualifications', [])
    delimiters = [' ', '(', ')', '/', '\n', '.', ',', ';', '"', "'", "’", '{', '}', '[', ']']
    lowered = (token.lower() for token in tokenize('\n'.join(content), delimiters))
    return [token for token in lowered if token in packages]
jobposts['packages'] = jobposts.apply(row_to_py_packages, axis=1)
# One boolean column per package
for pck in packages:
    jobposts[pck] = jobposts['packages'].apply(lambda pckgs: pck in pckgs)
# Helper column to help with counting
jobposts['cnt'] = True
# Filter the dataset for Data positions
def title_of_interest(t):
    """Map a raw job title to one of the canonical data roles.

    Returns the role name, or None for excluded titles (interns, managers,
    directors, Facebook University) and for titles matching no data role.
    """
    t = t.lower()
    if ('intern' in t) or ('manager' in t) or ('director' in t) or ('facebook university' in t):
        return None
    elif 'machine learning' in t and 'engineer' in t:
        return 'ML Engineer'
    elif 'research scientist' in t and (
            'data' in t or
            'machine learning' in t or
            'ai' in t or  # NOTE(review): substring test also matches words containing "ai"
            'natural language processing' in t or
            'ml' in t or  # NOTE(review): substring test also matches words containing "ml" (e.g. "html")
            'artificial intelligence' in t or  # typo fix: was 'artifical intelligence'
            'statistic' in t):
        return 'Research Scientist'
    elif 'data engineer' in t:
        return 'Data Engineer'
    elif 'data analyst' in t:
        return 'Data Analyst'
    elif 'data scientist' in t:
        return 'Data Scientist'
    return None  # no data-related match
jobposts['role'] = jobposts['title'].apply(title_of_interest)
# Keep only postings that mapped to one of the data roles
data_jobposts = jobposts[jobposts['role'].notnull()].copy()
# Data roles
roles = ['Data Analyst', 'Data Engineer', 'Data Scientist', 'Research Scientist', 'ML Engineer']
# Degrees of education
education_levels = ['Bachelor', 'Master', 'PhD']
# Per role: share of postings mentioning each education level
# (divide the per-level counts by the total posting count `cnt`)
bar_data = data_jobposts.groupby('role')[['cnt'] + education_levels].sum()
bar_data = bar_data[education_levels].div(bar_data['cnt'], axis=0)
bar_data = bar_data.reindex(roles)[education_levels]
# ===================================================
# Recreating graph from reference notebook in Plotly
fig = subplots.make_subplots(
    rows=5,
    cols=1,
    vertical_spacing = 0.1,
    subplot_titles = roles
)
# One bar subplot per role; the most-mentioned education level gets the
# highlight color, the rest stay grey
for index, role in enumerate(roles):
    role_data = bar_data[bar_data.index == role]
    y_values = list(role_data.values[0])
    index_of_max = y_values.index(max(y_values))
    colors = ["#999"]*3
    colors[index_of_max] = palette_orange  # palette_orange defined in an earlier cell
    trace = go.Bar(
        y = role_data.values[0],
        x = role_data.columns,
        marker = dict(
            color = colors,
        ),
        texttemplate = " <b style='color: 222; font-size:12px'>%{y:.2p}</b>",
        textposition = "inside",
        showlegend = False,
        name = role,
    )
    fig.add_trace(trace, index+1, 1)
# Layout for the five stacked subplots. The original spelled out five
# byte-identical xaxis/yaxis dicts; they are generated in a loop instead so a
# styling change only needs to be made once.
layout = dict(
    margin = dict(t=100, b=0, pad=5),
    bargap = 0.02,
    width = 500,
    height = 800,
    plot_bgcolor = "#fff",
)
for _i in range(1, 6):
    # Plotly names the first pair "xaxis"/"yaxis" (no suffix), then "xaxis2"...
    _suffix = "" if _i == 1 else str(_i)
    layout["xaxis" + _suffix] = dict(tickangle = 0)
    layout["yaxis" + _suffix] = dict(
        zeroline = True,
        zerolinecolor = "#4d4d4d",
        zerolinewidth = 2,
        gridcolor = "#ccc",
        tickformat = ".0%",
        dtick = 0.5,
        range = [0, 1.05],
    )
fig.update_layout(layout)
# Title + subtitle (markup fix: the middle line was missing its closing </span>)
text = [
    "<span style='font-size:24px; font-family:Times New Roman;'>Education Preferences at Facebook (Meta)</span>",
    "<span style='font-size:13px; font-family:Helvetica'>We see the proportion of job posts on Facebook Careers that specify the</span>",
    "<span style='font-size:13px; font-family:Helvetica'>degree as the <b style='color:%s'>preferred education level</b> for the role </span>" % palette_orange,
]
annotation_helper(fig, text, 1.3, 1.15, [0.044,0.025],ref="paper", width=500)
iplot(fig)
def _mean_pay_by_education(role):
    """Mean reported yearly compensation (Q25) per education level (Q4) for
    respondents in `role`, excluding non-positive compensation values.

    The two doctorate categories are merged into 'Doctorate-level degree' and
    the index is relabelled with the short display labels from label_map.
    NOTE(review): merging by *summing* the two group means (rather than a
    pooled mean) reproduces the original notebook's behaviour — confirm intended.
    """
    pays = data_21[(data_21["Q5"] == role) & (data_21["Q25_num"] > 0)].groupby("Q4")["Q25_num"].mean()
    pays["Doctorate-level degree"] = pays["Doctoral degree"] + pays["Professional doctorate"]
    pays = pays.drop(["Doctoral degree", "Professional doctorate", "I prefer not to answer"])
    pays.index = [label_map[label] for label in pays.index]
    return pays

# Same pipeline for both roles (was duplicated inline)
education_pays_da = _mean_pay_by_education("Data Analyst")
education_pays_ds = _mean_pay_by_education("Data Scientist")
# use ipython markdown to allow us to use data from python in formatted markdown cells
markdown = """
<h3>Average salary based on highest level of education</h3>
<table class="salary-diff-table">
<tr> <th> Education </th> <th> Data Scientist </th> <th> Data Analyst </th> </tr>
"""
for label in ordered_labels[::-1]:
    # Index by label to get scalars: the original boolean-mask lookup produced
    # one-element Series, and "%d" on a Series is deprecated (a TypeError in
    # recent pandas versions).
    pay_da = education_pays_da[label]
    pay_ds = education_pays_ds[label]
    markdown += """<tr> <td> %s </td> <td class="cell-highlight-orange"> %d </td> <td> %d </td></tr>""" % (label, pay_ds, pay_da)
markdown += " </table> "
# adding some basic text markdown before and after the table, could also just be added as separate md cells
markdown_before_table = """
Note that the chart explores the <b>preferred rather than minimum</b> requirements, so in cases where a PHD is preferred it may still be sufficient for a candidate to simply have a Master's or Bachelor's degree.
Another relevant question to ask is - “<b>do companies pay more for a degree?</b>”. We look at how average salaries vary depending on the highest degree held in each field.
Bear in mind that individuals with advanced degrees may also have more experience leading to higher pays.
<div class="sidenote">Source: Kaggle ML & DS Survey 2021<br>
- Q4. What is the highest level of formal education that you have attained or plan to attain within the next 2 years? <br>
- Q25. What is your current yearly compensation?
</div>
"""
markdown_after_table = """
"""
md(markdown_before_table + markdown + markdown_after_table)
Note that the chart explores the preferred rather than minimum requirements, so in cases where a PHD is preferred it may still be sufficient for a candidate to simply have a Master's or Bachelor's degree.
Another relevant question to ask is - “do companies pay more for a degree?”. We look at how average salaries vary depending on the highest degree held in each field. Bear in mind that individuals with advanced degrees may also have more experience leading to higher pays.
| Education | Data Scientist | Data Analyst |
|---|---|---|
| Doctorate-level degree | 131479 | 92964 |
| Master’s degree | 51963 | 29264 |
| Bachelor’s degree | 32220 | 18644 |
| College/University without Bachelor's degree | 34782 | 18252 |
| High school | 14125 | 12631 |
# The lemmatizer lazy-loads the WordNet corpus on first use and raises
# LookupError when the NLTK data is not installed (as the stopwords lookup
# below did in this environment) — download it up front if missing.
try:
    nltk.data.find("corpora/wordnet")
except LookupError:
    nltk.download("wordnet")
wordnet_lemmatizer = WordNetLemmatizer()
#defining the function to remove punctuation
def remove_punctuation(text):
    """Strip every ASCII punctuation character from `text`."""
    return text.translate(str.maketrans("", "", string.punctuation))
#defining the function for lemmatization
def lemmatizer(text):
    """Lemmatize each token of `text` (an iterable of words) and re-join with spaces."""
    return " ".join(wordnet_lemmatizer.lemmatize(token) for token in text)
#Stop words present in the library
# Download the corpus on first run instead of crashing with LookupError
# (which is exactly how this cell previously failed).
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')
#defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    """Drop English stopwords from `text`, splitting on single spaces."""
    output = [i for i in "".join(text).split(" ") if i not in stopwords]
    return " ".join(output)
def _clean_job_descriptions(df):
    """Lower-case, de-punctuate, de-stopword and lemmatize the
    'Job Description' column of `df`, returning the cleaned Series."""
    cleaned = df['Job Description'].apply(remove_punctuation)
    cleaned = cleaned.str.lower()
    cleaned = cleaned.apply(remove_stopwords)
    # Collapse newline runs to single spaces before lemmatizing
    cleaned = cleaned.str.replace(r"(\n)+", " ", regex=True)
    return cleaned.apply(lambda x: lemmatizer(x.split(" ")))

# Cleaning the text for data analysts and data scientists (the same pipeline
# was previously duplicated inline for each frame)
da['job_description_cleaned'] = _clean_job_descriptions(da)
ds['job_description_cleaned'] = _clean_job_descriptions(ds)
# Per-word frequencies across all cleaned descriptions, one frame per role
word_counts_da = (
    da["job_description_cleaned"].str.split(expand=True).stack().value_counts().to_frame(name="count")
)
word_counts_ds = (
    ds["job_description_cleaned"].str.split(expand=True).stack().value_counts().to_frame(name="count")
)
word_list_da = word_counts_da.index.to_list()
word_list_ds = word_counts_ds.index.to_list()
# Visualisation code
# Will return the formatted word according to its count.
def get_word_color(word, word_counts, word_list):
    """Return an inline-CSS style for `word` based on how frequent its lemma
    is in `word_counts` (sorted descending by count)."""
    word = remove_punctuation(word.lower())
    lemmatized_word = wordnet_lemmatizer.lemmatize(word)
    # Frequency cutoffs: roughly the top 0.3% of the vocabulary gets the
    # strong highlight, the top 1% the softer one.
    higher_cutoff_count = word_counts.iloc[int(word_counts.shape[0] * 0.3 / 100)]["count"]
    lower_cutoff_count = word_counts.iloc[int(word_counts.shape[0] * 1 / 100)]["count"]
    if word in stopwords or word == "•":
        return "color:#444;"
    if lemmatized_word in word_list:
        count = word_counts["count"].loc[lemmatized_word]
        if count > higher_cutoff_count:
            return "color: #efefef; font-weight: bold; background: #b2481b; padding: 0 1px 1px 1px; margin: 0px 1px; border-radius: 2px;"
        if count > lower_cutoff_count:
            return "color: #cd8b59; font-weight: bold; text-shadow: 0 0 5px rgba(205,139, 89, 0.0);"
    # Bug fix: the original fell through and returned None for words absent
    # from word_list, which then rendered as style='None' in the markup.
    return "color:#444"
# Format the description data based on word counts
def create_formatted_md(sentences, word_counts, word_list):
    """Wrap every word of `sentences` in a <span> styled by its frequency
    (see get_word_color); newlines are rendered as <br>."""
    sentences = sentences.replace("\n", " <br> ")
    formatting_list = []
    for word in re.split(r" |\n", sentences):  # raw string for the regex
        color = get_word_color(word, word_counts, word_list)
        formatting_list.append("<span style='%s'> %s </span>" % (color, word))
    # (dropped a no-op `output = output` statement from the original)
    return " ".join(formatting_list)
# testing if it works..
# md(create_formatted_md(da["Job Description"].iloc[5], word_counts_da, word_list_da))
--------------------------------------------------------------------------- LookupError Traceback (most recent call last) File C:\Python310\lib\site-packages\nltk\corpus\util.py:84, in LazyCorpusLoader.__load(self) 83 try: ---> 84 root = nltk.data.find(f"{self.subdir}/{zip_name}") 85 except LookupError: File C:\Python310\lib\site-packages\nltk\data.py:583, in find(resource_name, paths) 582 resource_not_found = f"\n{sep}\n{msg}\n{sep}\n" --> 583 raise LookupError(resource_not_found) LookupError: ********************************************************************** Resource stopwords not found. Please use the NLTK Downloader to obtain the resource: >>> import nltk >>> nltk.download('stopwords') For more information see: https://www.nltk.org/data.html Attempted to load corpora/stopwords.zip/stopwords/ Searched in: - 'C:\\Users\\Vilma/nltk_data' - 'C:\\Python310\\nltk_data' - 'C:\\Python310\\share\\nltk_data' - 'C:\\Python310\\lib\\nltk_data' - 'C:\\Users\\Vilma\\AppData\\Roaming\\nltk_data' - 'C:\\nltk_data' - 'D:\\nltk_data' - 'E:\\nltk_data' ********************************************************************** During handling of the above exception, another exception occurred: LookupError Traceback (most recent call last) Input In [22], in <cell line: 14>() 11 return " ".join(lemm_text) 13 #Stop words present in the library ---> 14 stopwords = nltk.corpus.stopwords.words('english') 16 #defining the function to remove stopwords from tokenized text 17 def remove_stopwords(text): File C:\Python310\lib\site-packages\nltk\corpus\util.py:121, in LazyCorpusLoader.__getattr__(self, attr) 118 if attr == "__bases__": 119 raise AttributeError("LazyCorpusLoader object has no attribute '__bases__'") --> 121 self.__load() 122 # This looks circular, but its not, since __load() changes our 123 # __class__ to something new: 124 return getattr(self, attr) File C:\Python310\lib\site-packages\nltk\corpus\util.py:86, in LazyCorpusLoader.__load(self) 84 root = 
nltk.data.find(f"{self.subdir}/{zip_name}") 85 except LookupError: ---> 86 raise e 88 # Load the corpus. 89 corpus = self.__reader_cls(root, *self.__args, **self.__kwargs) File C:\Python310\lib\site-packages\nltk\corpus\util.py:81, in LazyCorpusLoader.__load(self) 79 else: 80 try: ---> 81 root = nltk.data.find(f"{self.subdir}/{self.__name}") 82 except LookupError as e: 83 try: File C:\Python310\lib\site-packages\nltk\data.py:583, in find(resource_name, paths) 581 sep = "*" * 70 582 resource_not_found = f"\n{sep}\n{msg}\n{sep}\n" --> 583 raise LookupError(resource_not_found) LookupError: ********************************************************************** Resource stopwords not found. Please use the NLTK Downloader to obtain the resource: >>> import nltk >>> nltk.download('stopwords') For more information see: https://www.nltk.org/data.html Attempted to load corpora/stopwords Searched in: - 'C:\\Users\\Vilma/nltk_data' - 'C:\\Python310\\nltk_data' - 'C:\\Python310\\share\\nltk_data' - 'C:\\Python310\\lib\\nltk_data' - 'C:\\Users\\Vilma\\AppData\\Roaming\\nltk_data' - 'C:\\nltk_data' - 'D:\\nltk_data' - 'E:\\nltk_data' **********************************************************************
# Top words normalised by the number of postings in each dataset
words_da = word_counts_da[:30] / da.shape[0]
words_ds = word_counts_ds[:30] / ds.shape[0]
def style_words_in_both_lists(word):
    # Highlight words appearing in both roles' top slices.
    # NOTE(review): the table key below says "top 20" but membership here is
    # tested against the top-30 slices — confirm which cutoff is intended.
    if word in words_da.index and word in words_ds.index:
        return "style=' color: #efefef !important; background: #b2481b !important; '"
    else:
        return "style='color:black;'"
# use ipython markdown to allow us to use data from python in formatted markdown cells
markdown = """
<h3>Most common words in job descriptions</h3>
<b> Key </b> - <span class='highlight-orange'>Words which show up in top 20 for both lists</span>
<br> <br>
<table class="related-queries-table">
<tr style='border: 2px solid white'> <th style='border: 2px solid white !important'> Data Analyst </th> <th> count / num_postings </th> <th style='border: 2px solid white !important'> Data Scientist </th> <th> count / num_postings </th> </tr>
"""
# One table row per rank: analyst word/frequency on the left, scientist on the
# right. Cells for words shared by both lists get the highlight style.
for rank in range(20):
    da_word = words_da.index[rank]
    ds_word = words_ds.index[rank]
    da_freq = np.round(words_da.iloc[rank].values[0], 2)
    ds_freq = np.round(words_ds.iloc[rank].values[0], 2)
    markdown += (
        f"<tr> <td {style_words_in_both_lists(da_word)}> {da_word} </td> "
        f"<td> {da_freq:.2f} </td> "
        f"<td {style_words_in_both_lists(ds_word)}> {ds_word} </td> "
        f"<td> {ds_freq:.2f} </td> </tr>\n"
    )
markdown += " </table> "
# adding some basic text markdown before and after the table, could also just be added as separate md cells
markdown_before_table = """
When looking at the content of the job descriptions, we see that in most cases it is very difficult to tell whether the position is for a data scientist or an analyst.
Even though the job descriptions are meant for different posts, we see a major overlap in the commonly used words as shown below.
<div class="sidenote"> Source: Kaggle - Data Analyst jobs, Data Scientist jobs datasets</div>
"""
markdown_after_table = """
"""
md(markdown_before_table + markdown + markdown_after_table)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Input In [23], in <cell line: 1>() ----> 1 words_da = word_counts_da[:30] / da.shape[0] 2 words_ds = word_counts_ds[:30] / ds.shape[0] 4 def style_words_in_both_lists(word): NameError: name 'word_counts_da' is not defined
When focusing on the actual job descriptions, you can explore them yourself in the following interactive piece. Certain keywords are highlighted based on where they rank in terms of their frequency in the job listings.
# Write the cleaned and formatted data.
# Each posting gets an HTML/markdown-formatted copy of its description with
# frequent keywords highlighted (create_formatted_md is defined in an earlier cell).
da['job_description_formatted'] = da['Job Description'].apply(lambda x: create_formatted_md(x, word_counts_da, word_list_da))
ds['job_description_formatted'] = ds['Job Description'].apply(lambda x: create_formatted_md(x, word_counts_ds, word_list_ds))
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True is the drop-in replacement.
job_exploration_data = pd.concat([da, ds], ignore_index=True)
columns_required = ["Job Title", "Industry", "Company Name", "Job Description", "job_description_formatted"]
# Only the columns the d3 widget below reads are written out.
job_exploration_data[columns_required].to_csv("job_exploration_data.csv", index=False)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Input In [24], in <cell line: 2>() 1 # Write the cleaned and formatted data. ----> 2 da['job_description_formatted'] = da['Job Description'].apply(lambda x:create_formatted_md(x, word_counts_da, word_list_da)) 3 ds['job_description_formatted'] = ds['Job Description'].apply(lambda x:create_formatted_md(x, word_counts_ds, word_list_ds)) 5 job_exploration_data = da.append(ds, ignore_index=True) File C:\Python310\lib\site-packages\pandas\core\series.py:4433, in Series.apply(self, func, convert_dtype, args, **kwargs) 4323 def apply( 4324 self, 4325 func: AggFuncType, (...) 4328 **kwargs, 4329 ) -> DataFrame | Series: 4330 """ 4331 Invoke function on values of Series. 4332 (...) 4431 dtype: float64 4432 """ -> 4433 return SeriesApply(self, func, convert_dtype, args, kwargs).apply() File C:\Python310\lib\site-packages\pandas\core\apply.py:1082, in SeriesApply.apply(self) 1078 if isinstance(self.f, str): 1079 # if we are a string, try to dispatch 1080 return self.apply_str() -> 1082 return self.apply_standard() File C:\Python310\lib\site-packages\pandas\core\apply.py:1137, in SeriesApply.apply_standard(self) 1131 values = obj.astype(object)._values 1132 # error: Argument 2 to "map_infer" has incompatible type 1133 # "Union[Callable[..., Any], str, List[Union[Callable[..., Any], str]], 1134 # Dict[Hashable, Union[Union[Callable[..., Any], str], 1135 # List[Union[Callable[..., Any], str]]]]]"; expected 1136 # "Callable[[Any], Any]" -> 1137 mapped = lib.map_infer( 1138 values, 1139 f, # type: ignore[arg-type] 1140 convert=self.convert_dtype, 1141 ) 1143 if len(mapped) and isinstance(mapped[0], ABCSeries): 1144 # GH#43986 Need to do list(mapped) in order to get treated as nested 1145 # See also GH#25959 regarding EA support 1146 return obj._constructor_expanddim(list(mapped), index=obj.index) File C:\Python310\lib\site-packages\pandas\_libs\lib.pyx:2870, in 
pandas._libs.lib.map_infer() Input In [24], in <lambda>(x) 1 # Write the cleaned and formatted data. ----> 2 da['job_description_formatted'] = da['Job Description'].apply(lambda x:create_formatted_md(x, word_counts_da, word_list_da)) 3 ds['job_description_formatted'] = ds['Job Description'].apply(lambda x:create_formatted_md(x, word_counts_ds, word_list_ds)) 5 job_exploration_data = da.append(ds, ignore_index=True) NameError: name 'create_formatted_md' is not defined
# HTML scaffold for the interactive job-description browser: a search box,
# a search button, and a container whose header/body the d3 script (js_t1)
# fills in. The hidden <img> exists only so the script can recover the
# notebook's base URL from its resolved src.
htmlt1 = '''
<head>
<style>
.job_description_container {
box-shadow: rgba(0, 0, 0, 0.16) 0px 3px 6px, rgba(0, 0, 0, 0.23) 0px 3px 6px;
margin-top: 5px;
padding:10px;
border: 1px solid #999;
}
</style>
</head>
<body>
<img id="baseimg" src="img.png" style="display:none" />
<label for="key"> Term to search for:</label> <br>
<input type="text" name="key" class="job_description_searchbox" value="education">
<input type="button" class="job_description_search_button" value="Search">
<div class="job_description_container">
<div class="job_description_header">
Header text will go here
</div>
<hr>
<div class="job_description_body">
Body text will go here
</div>
</div>
</body>
'''
# d3 (v4) script executed in the browser: loads job_exploration_data.csv
# (written by an earlier cell), filters rows whose raw "Job Description"
# contains the search term, shows a random matching posting, and — for terms
# of 5+ characters — bolds occurrences of the term in the formatted body.
# NOTE(review): the highlight replaces case-insensitive regex matches with the
# literal search-box text, so matched text may change case — presumably an
# accepted cosmetic trade-off; confirm if exact casing matters.
js_t1 = '''
require.config({
paths: {
d3: "https://d3js.org/d3.v4.min"
}
});
require(["d3"], function(d3) {
var baseUrl = document.getElementById('baseimg').src.replace(/img.png.*$/, '')
d3.csv(baseUrl + "job_exploration_data.csv", function (data) {
var container = d3.select(".job_description_container")
var container_body = container.select(".job_description_body")
var container_header = container.select(".job_description_header")
let searchbox = d3.select(".job_description_searchbox")
let search_button = d3.select(".job_description_search_button")
showData(searchbox.node().value);
search_button
.on("click", function(){
showData( searchbox.node().value )
})
function getRandomInt(min, max) {
var min = Math.ceil(min);
var max = Math.floor(max);
return Math.floor(Math.random() * (max - min) + min); //The maximum is exclusive and the minimum is inclusive
}
function showData( key ){
var filtered_data = data.filter( d => d["Job Description"].toLowerCase().includes(key.toLowerCase()) )
if (filtered_data.length == 0){
container_header.html("");
container_body.html("No results found")
}
else{
var random_index = getRandomInt(0, filtered_data.length)
var d = filtered_data[random_index]
var header = "<div class='job_description_header'>"
header += " <b> Number of results </b>: " + filtered_data.length + " <hr>"
header += " <b> Company Name </b>: " + d["Company Name"] + " <br>"
header += " <b> Industry </b>: " + d["Industry"] + " <br>"
header += " <b> Job Title </b>: " + d["Job Title"] + " <br>"
header += "</div>"
container_header.html(header)
var body = d["job_description_formatted"]
if(key.length >= 5)
{
var searchMask = key;
var regEx = new RegExp(searchMask, "ig");
var replaceMask = "<b style='background:black !important; color:white; padding: 5px; '>"+key+"</b>";
body = body.replace(regEx, replaceMask);
}
container_body.html(body)
}
}
})
})
'''
# Render the widget: inject the HTML scaffold first, then execute the d3
# script against it in the notebook frontend.
h = display(HTML(htmlt1))
j = IPython.display.Javascript(js_t1)
IPython.display.display_javascript(j)
In the end, no single role can do it all by themselves and it requires a team effort to make your organisation's data work for you.
# Applying the CSS styling to all previous cells
# (injects the `styling` <style> block defined at the top of the notebook).
HTML(styling)